Unverified commit aa94ec7b authored by: H HappyAngel, committed by: GitHub

Merge pull request #152 from PaddlePaddle/develop

pull code
......@@ -23,6 +23,9 @@ if(NOT DEFINED BM_SDK_ROOT)
endif()
endif()
set(BM_SDK_CPLIB_RPATH ${BM_SDK_ROOT}/lib/bmcompiler)
set(BM_SDK_LIB_RPATH ${BM_SDK_ROOT}/lib/bmnn/pcie)
message(STATUS "BM_SDK_ROOT: ${BM_SDK_ROOT}")
find_path(BM_SDK_INC NAMES bmruntime_interface.h
PATHS ${BM_SDK_ROOT}/include/bmruntime NO_DEFAULT_PATH)
......@@ -37,43 +40,35 @@ include_directories("${BM_SDK_ROOT}/include/bmcpu")
include_directories("${BM_SDK_ROOT}/include/bmlog")
find_library(BM_SDK_RT_LIB NAMES bmrt
PATHS ${BM_SDK_ROOT}/lib/bmnn/pcie)
PATHS ${BM_SDK_LIB_RPATH})
if(NOT BM_SDK_RT_LIB)
message(FATAL_ERROR "Can not find bmrt Library in ${BM_SDK_ROOT}")
else()
message(STATUS "Found bmrt Library: ${BM_SDK_RT_LIB}")
add_library(bmrt SHARED IMPORTED GLOBAL)
set_property(TARGET bmrt PROPERTY IMPORTED_LOCATION ${BM_SDK_RT_LIB})
endif()
find_library(BM_SDK_BM_LIB NAMES bmlib
PATHS ${BM_SDK_ROOT}/lib/bmnn/pcie)
PATHS ${BM_SDK_LIB_RPATH})
if(NOT BM_SDK_BM_LIB)
message(FATAL_ERROR "Can not find bmlib Library in ${BM_SDK_ROOT}")
else()
message(STATUS "Found bmlib Library: ${BM_SDK_BM_LIB}")
add_library(bmlib SHARED IMPORTED GLOBAL)
set_property(TARGET bmlib PROPERTY IMPORTED_LOCATION ${BM_SDK_BM_LIB})
endif()
find_library(BM_SDK_COMPILER_LIB NAMES bmcompiler
PATHS ${BM_SDK_ROOT}/lib/bmcompiler)
PATHS ${BM_SDK_CPLIB_RPATH})
if(NOT BM_SDK_COMPILER_LIB)
message(FATAL_ERROR "Can not find bmcompiler Library in ${BM_SDK_ROOT}")
else()
message(STATUS "Found bmcompiler Library: ${BM_SDK_COMPILER_LIB}")
add_library(bmcompiler SHARED IMPORTED GLOBAL)
set_property(TARGET bmcompiler PROPERTY IMPORTED_LOCATION ${BM_SDK_COMPILER_LIB})
endif()
find_library(BM_SDK_CPU_LIB NAMES bmcpu
PATHS ${BM_SDK_ROOT}/lib/bmnn/pcie)
PATHS ${BM_SDK_LIB_RPATH})
if(NOT BM_SDK_CPU_LIB)
message(FATAL_ERROR "Can not find bmcpu Library in ${BM_SDK_ROOT}")
else()
message(STATUS "Found bmcpu Library: ${BM_SDK_CPU_LIB}")
add_library(bmcpu SHARED IMPORTED GLOBAL)
set_property(TARGET bmcpu PROPERTY IMPORTED_LOCATION ${BM_SDK_CPU_LIB})
endif()
set(bm_runtime_libs bmrt bmlib bmcompiler bmcpu CACHE INTERNAL "bm runtime libs")
......
......@@ -100,7 +100,7 @@ function(compile_flatbuffers_schema_to_cpp_opt TARGET SRC_FBS OPT)
${OPT}
-o "${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS_DIR}"
"${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS}"
DEPENDS flatbuffers
DEPENDS flatbuffers ${SRC_FBS}
COMMENT "Run generation: '${GEN_HEADER}'")
register_generated_output(${GEN_HEADER})
add_custom_target(${TARGET} ALL DEPENDS ${GEN_HEADER})
......@@ -108,7 +108,10 @@ endfunction()
set(FRAMEWORK_FBS_DIR "lite/model_parser/flatbuffers")
set(FRAMEWORK_SCHEMA_PATH "${FRAMEWORK_FBS_DIR}/framework.fbs")
set(PARAM_SCHEMA_PATH "${FRAMEWORK_FBS_DIR}/param.fbs")
compile_flatbuffers_schema_to_cpp_opt(framework_fbs_header ${FRAMEWORK_SCHEMA_PATH} "--no-includes;--gen-compare;--force-empty")
compile_flatbuffers_schema_to_cpp_opt(param_fbs_header ${PARAM_SCHEMA_PATH} "--no-includes;--gen-compare;--force-empty")
include_directories(${FLATBUFFERS_INCLUDE_DIR})
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS_DIR})
add_custom_target(fbs_headers ALL DEPENDS framework_fbs_header param_fbs_header)
......@@ -44,6 +44,8 @@ sh run_benchmark.sh
3. It then automatically runs another script, `benchmark.sh` (if multiple phones are connected over USB, append the test phone's `serial number` to the `adb` commands in `benchmark.sh`, as sketched below);
4. Finally, it downloads the benchmark results `result_armv7.txt` and `result_armv8.txt` from the phone to the current directory and prints the benchmark results.
> **Note:** If you run into an `Operation not permitted` error, grant permission with `sudo +sh run_benchmark.sh`, try turning the phone's **USB debugging** and **file transfer mode** off and on again, or reconnect the phone over USB and rerun the script.
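For step 3, a minimal hedged sketch of that change (the serial number and binary name below are placeholders):
```shell
# List connected devices and note the serial number of the target phone
adb devices
# In benchmark.sh, add "-s <serial>" to each adb command, for example:
adb -s 0123456789ABCDEF push benchmark_bin_v8 /data/local/tmp
adb -s 0123456789ABCDEF shell "chmod +x /data/local/tmp/benchmark_bin_v8"
```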
## II. Step-by-step benchmark
### 1. Build the benchmark executable
......
......@@ -36,9 +36,11 @@
**Required environment**: Android Studio, an Android phone (with USB debugging enabled), and a local copy of the [Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo) project
**Prerequisite**: If your Android Studio does not have the NDK configured yet, set it up first by following [Install and configure the NDK and CMake](https://developer.android.com/studio/projects/install-ndk) in the Android Studio user guide. You can choose the latest NDK version, or keep it consistent with the NDK version used in [Android build environment setup](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#android).
**Deployment steps**
1.  The Android object detection demo is located at `Paddle-Lite-Demo\PaddleLite-android-demo\object_detection_demo`
1. The Android object detection demo is located at `Paddle-Lite-Demo\PaddleLite-android-demo\object_detection_demo`
2. Open the object_detection_demo project with Android Studio (this step requires Internet access).
......@@ -46,12 +48,17 @@
![Android_studio](https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/android/Android_studio.png)
**Note:** If you get an NDK configuration error while importing, building, or running the project, open File > Project Structure > SDK Location and set "Android NDK location" to the NDK path configured on your machine. If you downloaded the NDK through Android Studio's SDK Tools (see "Prerequisite" above), you can simply pick the default path from the drop-down. If the NDK configuration error still persists, try updating the Android Gradle plugin version following the [Update the Android Gradle plugin](https://developer.android.com/studio/releases/gradle-plugin?hl=zh-cn#updating-plugin) section of the official Android Studio documentation.
<p align="center"><img width="600" height="450" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/android/Andriod_Studio_NDK.png"/>
4. Press the Run button to automatically build the app and install it on the phone. (This step automatically downloads the Paddle-Lite inference library and the model, and requires Internet access.)
After a successful run it looks as follows. Figure 1: the app installed on the phone. Figure 2: the app after launch, automatically detecting and labeling the objects in the image.
<p align="center"><img width="300" height="450" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/android/AndroidApp0.png"/>&#8194;&#8194;&#8194;&#8194;&#8194;<img width="300" height="450" src="https://paddlelite-data.bj.bcebos.com/doc_images/Android_iOS_demo/android/AndroidApp1.jpg"/></p>
## Android demo structure
The code structure of the Android demo is shown in the figure below:
......
This diff is collapsed.
......@@ -159,12 +159,18 @@ $ git checkout <release-version-tag>
$ wget https://paddlelite-demo.bj.bcebos.com/devices/mediatek/apu_ddk.tar.gz
$ tar -xvf apu_ddk.tar.gz
```
- Build tiny_publish for the MT8168-P2V1 Tablet
- Build tiny_publish for the MT8168-P2V1 Tablet and Smart TVs (S900)
```shell
$ ./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_shared --build_extra=ON --with_log=ON --build_apu=ON --apu_ddk_root=./apu_ddk tiny_publish
For MT8168-P2V1 Tablet
$ ./lite/tools/build_android.sh --android_stl=c++_shared --with_extra=ON --with_log=ON --with_mediatek_apu=ON --mediatek_apu_sdk_root=./apu_ddk
For Smart TVs(S900)
$ ./lite/tools/build_android.sh --arch=armv7 --android_stl=c++_shared --with_extra=ON --with_log=ON --with_mediatek_apu=ON --mediatek_apu_sdk_root=./apu_ddk
```
- Replace the PaddleLite-android-demo/libs/PaddleLite/arm64-v8a/include directory with the generated build.lite.android.armv8.gcc/inference_lite_lib.android.armv8.apu/cxx/include;
- Replace the PaddleLite-android-demo/libs/PaddleLite/arm64-v8a/lib/libpaddle_light_api_shared.so file with the generated build.lite.android.armv8.gcc/inference_lite_lib.android.armv8.apu/cxx/lib/libpaddle_light_api_shared.so.
- Replace the PaddleLite-android-demo/libs/PaddleLite/arm64-v8a/lib/libpaddle_light_api_shared.so file with the generated build.lite.android.armv8.gcc/inference_lite_lib.android.armv8.apu/cxx/lib/libpaddle_light_api_shared.so;
- Replace the PaddleLite-android-demo/libs/PaddleLite/armeabi-v7a/include directory with the generated build.lite.android.armv7.gcc/inference_lite_lib.android.armv7.apu/cxx/include;
- Replace the PaddleLite-android-demo/libs/PaddleLite/armeabi-v7a/lib/libpaddle_light_api_shared.so file with the generated build.lite.android.armv7.gcc/inference_lite_lib.android.armv7.apu/cxx/lib/libpaddle_light_api_shared.so (see the sketch below).
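A hedged sketch of these replacement steps, assuming the build output and the demo project sit side by side in the current directory (paths are assumptions):
```shell
LITE_V8=build.lite.android.armv8.gcc/inference_lite_lib.android.armv8.apu
LITE_V7=build.lite.android.armv7.gcc/inference_lite_lib.android.armv7.apu
DEMO=PaddleLite-android-demo/libs/PaddleLite

# arm64-v8a: replace headers and the light-API shared library
rm -rf ${DEMO}/arm64-v8a/include
cp -r ${LITE_V8}/cxx/include ${DEMO}/arm64-v8a/include
cp ${LITE_V8}/cxx/lib/libpaddle_light_api_shared.so ${DEMO}/arm64-v8a/lib/

# armeabi-v7a: the same replacement for the armv7 build
rm -rf ${DEMO}/armeabi-v7a/include
cp -r ${LITE_V7}/cxx/include ${DEMO}/armeabi-v7a/include
cp ${LITE_V7}/cxx/lib/libpaddle_light_api_shared.so ${DEMO}/armeabi-v7a/lib/
```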
## Other notes
......
# Deploying Paddle Lite inference with the (Huawei) NPU
Paddle Lite is the first inference framework to support Huawei's in-house Da Vinci architecture NPU (the NPU in the Kirin 810/990 SoCs).
It works by analyzing the Paddle model online, converting Paddle operators into HiAI IR, and then calling the HiAI IR/Builder/Runtime APIs to generate and execute a HiAI model.
## Supported devices
- Huawei nova5, nova5i pro, mate30, mate30 pro, mate30 5G, Honor v30, p40, p40 pro, and the upcoming mate40. According to Huawei, most phones released from now on will ship with its in-house Da Vinci architecture NPU.
## Supported models
- MobileNetV1
- MobileNetV2
- ResNet-18/50
- ShuffleNetV2
- squeezenet
- mnasnet
- yolov3
- CycleGAN (currently requires Huawei's internal ROM support)
- Internal Baidu business models (details cannot be disclosed for confidentiality reasons)
*Hybrid CPU/NPU scheduling can deliver better performance for some models*
## Supported (or partially supported) Paddle operators
- sigmoid
- relu
- tanh
- relu_clipped
- leaky_relu
- softsign
- hard_sigmoid
- batch_norm
- concat
- conv2d
- depthwise_conv2d
- conv2d_transpose
- dropout
- elementwise_add
- elementwise_sub
- elementwise_mul
- elementwise_div
- fusion_elementwise_add_activation
- fusion_elementwise_sub_activation
- fusion_elementwise_mul_activation
- fusion_elementwise_div_activation
- fc
- bilinear_interp
- nearest_interp
- matmul
- mul
- pad2d
- pool2d
- reduce_mean
- reshape
- reshape2
- scale
- shuffle_channel
- softmax
- split
- sqrt
- square
- transpose
- transpose2
- unsqueeze
- unsqueeze2
- instance_norm (currently requires Huawei's internal ROM support)
- layer_norm (currently requires Huawei's internal ROM support)
## Building the Paddle Lite library with NPU support
- Download the Huawei HiAI DDK from the [Huawei HiAI platform](https://developer.huawei.com/consumer/cn/hiai) and extract it to any path (note: Huawei provides several DDK versions; download the HiAI Foundation development kit for the Kirin 810/990 chips, e.g. the [DDK V310 release](https://obs.cn-north-2.myhwclouds.com/hms-ds-wf/sdk/hwhiai-ddk-100.310.011.010.zip)).
- Copy the ai_ddk_lib directory from the HiAI DDK into the root of the Paddle Lite source tree, then build with the [build script](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/tools/build_android.sh) (the NPU-related options must be specified).
Note: below is the directory layout of the extracted HiAI DDK V310; it is the ai_ddk_lib directory that needs to be copied into the Paddle Lite source root.
```shell
- app_sample
- ddk
- ai_ddk_lib
- include
- lib # for armv7
- lib64 # for armv8
- document
- tools
```
- Recommended build command. Since the HiAI DDK .so libraries are all built against c++_shared, it is recommended to build Paddle Lite with c++_shared as well.
```shell
# huawei_kirin_npu_sdk_root must point to the path of ai_ddk_lib
$ ./lite/tools/build_android.sh --android_stl=c++_shared --with_huawei_kirin_npu=ON --huawei_kirin_npu_sdk_root=<path-to-ai_ddk_lib>
# other options (e.g. the ARM version) can be listed with "./lite/tools/build_android.sh help"
```
Note: to keep the build environment consistent, it is recommended to set up the Docker development environment described in [Source compilation](../user_guides/source_compile) before running the command above.
## Generating an optimized NPU model
- The model_optimize_tool already supports generating NPU models; just set valid_targets to npu,arm. See [Model conversion](../user_guides/model_optimize_tool) for details.
```shell
./model_optimize_tool --model_dir=<model_param_dir> \
--model_file=<model_path> \
--param_file=<param_path> \
--optimize_out_type=(protobuf|naive_buffer) \
--optimize_out=<output_optimize_model_dir> \
--valid_targets=npu,arm \
--record_tailoring_info=(true|false)
```
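As a hedged, concrete example (the model and output paths below are hypothetical), converting a combined-params model for NPU+ARM could look like:
```shell
./model_optimize_tool --model_file=./mobilenet_v1/model \
                      --param_file=./mobilenet_v1/params \
                      --optimize_out_type=naive_buffer \
                      --optimize_out=./mobilenet_v1_npu \
                      --valid_targets=npu,arm \
                      --record_tailoring_info=false
```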
- The model produced by model_optimize_tool only marks the Paddle operators supported by the NPU; it does not actually generate the NPU HiAI model. Only at execution time are the marked Paddle operators converted into HiAI IR and the HiAI model finally generated and executed. See PR [2576](https://github.com/PaddlePaddle/Paddle-Lite/pull/2576) for the implementation.
- Depending on the model and on the Huawei phone model (ROM version), some Paddle operators may fail to convert fully into HiAI IR at execution time, or the target phone's HiAI version may be too old, so the HiAI model may fail to be generated. In that case Paddle Lite falls back to the CPU operators to complete the whole inference task.
## Loading and running NPU models through the Java API
**Note: due to root-permission restrictions on Huawei phones, NPU models can currently only be loaded and executed through the Java API**
- Usage is the same as in the [Java example](java_demo); no extra parameters need to be set, just switch to the NPU model. The Image Classification Demo for Android in [Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo) is an image classification demo that supports both CPU and NPU models.
Note: when copying libpaddle_lite_jni.so, because it depends on the HiAI DDK .so libraries and on libc++_shared.so, all .so files under ai_ddk_lib/lib or ai_ddk_lib/lib64 in the HiAI DDK, plus libc++_shared.so, must be copied into the same directory as libpaddle_lite_jni.so (see the sketch below).
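A hedged sketch of that copy step for an arm64-v8a JNI layout (the jniLibs path is an assumption, and the location of libc++_shared.so depends on your NDK version):
```shell
# Directory that already holds libpaddle_lite_jni.so (assumed app layout)
JNI_DIR=app/src/main/jniLibs/arm64-v8a
# HiAI DDK runtime libraries (use ai_ddk_lib/lib instead for armv7 builds)
cp ai_ddk_lib/lib64/*.so ${JNI_DIR}/
# libc++_shared.so ships with the NDK; locate the arm64-v8a copy under your NDK install
find ${NDK_ROOT} -name libc++_shared.so -path "*arm64-v8a*" -exec cp {} ${JNI_DIR}/ \;
```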
## Other notes
- Huawei's Da Vinci architecture NPU does much of its internal arithmetic in float16, so prediction results will deviate slightly; in most cases the accuracy loss is small. For reference, compare the CPU and NPU predictions for the same image in the Image Classification Demo for Android in [Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo).
- The in-house Da Vinci architecture NPU in the Kirin 810/990 SoCs is different from the Cambricon NPU in the Kirin 970/980 SoCs, and likewise different from the NNIE used by the Hi3559A and Hi3519A; Paddle Lite only supports Huawei's in-house Da Vinci architecture NPU.
- We keep adding Paddle operator bridges/converters that map onto HiAI IR so that more Paddle models can be supported, and Huawei engineers keep optimizing HiAI IR performance.
## Manual subgraph partitioning
### Background
- Paddle-Lite already supports a large number of Huawei NPU operators, but this still does not cover every model. For a model with some unsupported operators, Paddle-Lite splits it into subgraphs that run on the NPU and subgraphs that run on the CPU, scheduling NPU and CPU automatically; this usually gives good performance. In some special cases, however, the model is split into many subgraphs, the CPU/NPU switching overhead becomes large, and overall performance degrades. Manual subgraph partitioning is therefore needed to pin certain operators to the CPU and avoid excessive subgraphs.
### Feature
- Use a configuration file to specify the operators that must be forced to run on the CPU
### Usage
- 1. Open the Paddle model file with netron to inspect the model structure and obtain each operator's type, input names, and output names.
- Note: Paddle-Lite optimizes the model, so operators may change; the operators of the optimized model are what count. An example is given later.
- 2. Create a configuration file ```split_cfg.txt``` that records the operators to run on the CPU.
- One op record per line; the colon ":" separates the "op name", "op input names", and "op output names" fields, and the comma "," separates individual var names within the "op input names" and "op output names" fields.
- Input or output names may be partially omitted. For example, ```op3:in3_var0``` matches operators of type "op3" whose input is "in3_var0", and ```op4``` matches all operators of type "op4".
- Example 1:
```
op0:in0_var0,in0_var1:out0_var0,out0_var1
op1:in1_var0,in1_var1:out1_var0
op2::out2_var0
op3:in3_var0
op4
```
- Example 2:
```
transpose:conv2d_22.tmp_1:transpose_0.tmp_0
```
![image](https://user-images.githubusercontent.com/50474132/80475316-4a5fda80-897b-11ea-910a-6aee13243387.png)
- 3. Use the environment variable ```SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE``` to point to the configuration file.
- For example:
```
export SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE=/data/local/tmp/split_cfg.txt
```
- 4. Once the above steps are done, the matching operators in the running model will be forced to run on the CPU.
### Example
- Take the model [ssd_mobilenet_v1_pascalvoc](https://paddlelite-demo.bj.bcebos.com/models/ssd_mobilenet_v1_pascalvoc_fp32_300_fluid.tar.gz) as an example
- 1. Inspect the model with netron
- 2. Initial analysis
- The figure below shows part of ssd_mobilenet_v1. The red part is not yet supported on the NPU, and the blue part may not perform well on the NPU. If the inference library is left to schedule automatically, the model may be split into many subgraphs with poor overall performance. Therefore, the blue and green parts can be pinned to the CPU as a whole, letting the remaining parts run automatically on the NPU (the red part automatically runs on the CPU).
![](https://user-images.githubusercontent.com/50474132/80453173-525b5280-895a-11ea-847f-c7dd5b5799de.png)
- 3. Convert the model with opt
- opt prints log messages during the conversion. Search the log for ```digraph G``` and ```// end G``` to find the optimized model graph.
![](https://user-images.githubusercontent.com/50474132/80454098-145f2e00-895c-11ea-9f16-dde1483a9beb.png)
![](https://user-images.githubusercontent.com/50474132/80454123-1de89600-895c-11ea-86b9-a62d78a6616d.png)
- Save the whole graph section, from ```digraph G``` to ```// end G```, into a ```.dot``` file. It can then be viewed with ```graphviz``` or in the [online viewer](http://dreampuf.github.io/GraphvizOnline/); a small extraction sketch is given at the end of this example.
![](https://user-images.githubusercontent.com/50474132/80454841-47ee8800-895d-11ea-9531-5689c5560fcb.png)
- At this point, check whether the operators you want to pin were optimized away. (Ideally the operators to pin still exist as standalone ops; if they were fused into a single operator, it is the fused operator that must be specified.)
- 4. Write the configuration file
- In the configuration file, list the operators that the NPU could support but that should be forced to run on the CPU.
```
reshape
transpose
concat
softmax
```
- Since all of these operators are pinned to the CPU, there is no need to spell out their input and output names.
- 5. Set the configuration file path
- This is done via ```export SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE=your_split_config_file```.
- 6. Performance test
- Device: Huawei mate30 5G
- HiAI DDK version: 320
- Performance: about 71.8 ms on CPU, about 16.6 ms on NPU.
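For step 3 above, a hedged sketch of extracting and rendering the optimized graph (the opt flags and file names are placeholders; rendering requires graphviz):
```shell
# Run opt as in step 3 and capture its log
./opt --model_dir=./ssd_mobilenet_v1 --valid_targets=npu,arm --optimize_out=./ssd_npu > opt.log 2>&1
# Cut out the section between "digraph G" and "// end G"
sed -n '/digraph G/,/\/\/ end G/p' opt.log > model.dot
# Render locally with graphviz, or paste model.dot into the online viewer
dot -Tpng model.dot -o model.png
```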
......@@ -137,20 +137,26 @@ $ cd Paddle-Lite
$ git checkout <release-version-tag>
$ git clone https://github.com/airockchip/rknpu_ddk.git
```
- Build full_publish and tiny_publish for the RK1808 and RK1806 EVBs
- Build tiny_publish and full_publish for the RK1808 and RK1806 EVBs
```shell
For RK1808 EVB
$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv8 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk full_publish
$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv8 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk tiny_publish
tiny_publish
$ ./lite/tools/build_linux.sh --with_extra=ON --with_log=ON --with_rockchip_npu=ON --rockchip_npu_sdk_root=./rknpu_ddk
full_publish
$ ./lite/tools/build_linux.sh --with_extra=ON --with_log=ON --with_rockchip_npu=ON --rockchip_npu_sdk_root=./rknpu_ddk full_publish
For RK1806 EVB
$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv7 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk full_publish
$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv7 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk tiny_publish
tiny_publish
$ ./lite/tools/build_linux.sh --arch=armv7 --with_extra=ON --with_log=ON --with_rockchip_npu=ON --rockchip_npu_sdk_root=./rknpu_ddk
full_publish
$ ./lite/tools/build_linux.sh --arch=armv7 --with_extra=ON --with_log=ON --with_rockchip_npu=ON --rockchip_npu_sdk_root=./rknpu_ddk full_publish
```
- Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/include directory with the generated build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.rknpu/cxx/include;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/lib/libpaddle_light_api_shared.so file with the generated build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.rknpu/cxx/lib/libpaddle_light_api_shared.so;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/lib/libpaddle_light_api_shared.so file with build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.rknpu/cxx/lib/libpaddle_light_api_shared.so generated in tiny_publish mode;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/lib/libpaddle_full_api_shared.so file with build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.rknpu/cxx/lib/libpaddle_full_api_shared.so generated in full_publish mode;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/armhf/include directory with the generated build.lite.armlinux.armv7.gcc/inference_lite_lib.armlinux.armv7.rknpu/cxx/include;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_light_api_shared.so file with the generated build.lite.armlinux.armv7.gcc/inference_lite_lib.armlinux.armv7.rknpu/cxx/lib/libpaddle_light_api_shared.so.
- Replace the PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_light_api_shared.so file with build.lite.armlinux.armv7.gcc/inference_lite_lib.armlinux.armv7.rknpu/cxx/lib/libpaddle_light_api_shared.so generated in tiny_publish mode;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_full_api_shared.so file with build.lite.armlinux.armv7.gcc/inference_lite_lib.armlinux.armv7.rknpu/cxx/lib/libpaddle_full_api_shared.so generated in full_publish mode (see the sketch below).
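A hedged sketch of these replacements for the arm64 libraries (the armhf side is analogous; paths are assumptions, and each .so should be taken from its respective tiny_publish or full_publish build output):
```shell
LITE_V8=build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.rknpu
DEMO=PaddleLite-linux-demo/libs/PaddleLite/arm64
rm -rf ${DEMO}/include && cp -r ${LITE_V8}/cxx/include ${DEMO}/include
# light API .so from the tiny_publish build, full API .so from the full_publish build
cp ${LITE_V8}/cxx/lib/libpaddle_light_api_shared.so ${DEMO}/lib/
cp ${LITE_V8}/cxx/lib/libpaddle_full_api_shared.so ${DEMO}/lib/
```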
## 其它说明
......
......@@ -55,7 +55,7 @@ Welcome to Paddle-Lite's documentation!
demo_guides/cuda
demo_guides/opencl
demo_guides/fpga
demo_guides/npu
demo_guides/huawei_kirin_npu
demo_guides/baidu_xpu
demo_guides/rockchip_npu
demo_guides/mediatek_apu
......
......@@ -3,7 +3,7 @@
**Note: this build method only applies to versions after release/v2.6.0 (including v2.6.0)**
With the Android build environment installed, you can download and build the Paddle-Lite source code
If you have not yet set up the Android cross-compilation environment, first follow [Environment preparation](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#id2) to install, for your development environment, the toolchain required to build the Android inference library. Before running the build script, check that the environment variable `NDK_ROOT` points to the correct Android NDK installation path; you can then download and build the Paddle-Lite source code.
```shell
# 1. Download the Paddle-Lite source code and switch to the release branch
......@@ -14,6 +14,7 @@ cd Paddle-Lite && git checkout release/v2.3
./lite/tools/build_android.sh
```
> **Tip:** If the build spends a long time downloading third-party libraries, try deleting the `<lite-repo>/third-party` directory under Paddle-Lite and rerunning the build script; the script will then fetch a third-party source bundle hosted on Baidu Cloud, which saves the time of cloning the third-party code from the git repos.
### Build output
......
......@@ -3,10 +3,14 @@
opt is an x86 executable and must be run on a PC; Linux and Mac terminals are supported.
### Help message
Running opt without any input options prints the help message listing the currently supported options:
Running opt without any input options prints the help message listing the currently supported options:
```bash
./opt
```
> **Note:** If you obtained the opt executable from the link provided under "Method 2: download the opt executable" on the [Prepare opt](https://paddle-lite.readthedocs.io/zh/latest/user_guides/model_optimize_tool.html#id1) page, first add execute permission to the downloaded file with `chmod +x ./opt`.
![](https://paddlelite-data.bj.bcebos.com/doc_images/1.png)
### Feature 1: convert a model to the Paddle-Lite format
......
......@@ -38,7 +38,7 @@
### 2.3 Configure the calibration data generator
Post-training static quantization reads the calibration data internally through asynchronous data loading; you only need to configure a sample_generator that reads data matching the model's inputs. sample_generator is a Python generator that **must return a single sample per call**, and it is used as the data source for `DataLoader.set_sample_generator()`.
It is recommended to study the [asynchronous data loading documentation](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/data_preparing/use_py_reader.html) together with the example in this document to learn how to configure the calibration data generator.
It is recommended to study the [asynchronous data loading documentation](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/data_preparing/static_mode/use_py_reader.html) together with the example in this document to learn how to configure the calibration data generator.
### 2.4 Invoke post-training static quantization
......
# Prebuilt libraries
# Prebuilt library download
## Build variants
......
# Model conversion tool X2Paddle
X2Paddle can convert Caffe, TensorFlow, and ONNX models into models supported by Paddle.
X2Paddle can convert Caffe, TensorFlow, and ONNX models into models supported by Paddle. Currently supported versions: Caffe 1.0; TensorFlow 1.x (1.4.0 recommended); ONNX 1.6.0 with OpSet versions 9, 10, and 11.
[X2Paddle](https://github.com/PaddlePaddle/X2Paddle) supports converting Caffe/TensorFlow models into PaddlePaddle models.
For supported models, see the **X2Paddle model test suite:**
......
......@@ -16,7 +16,7 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
DEPS paddle_api paddle_api_light paddle_api_full)
target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files})
add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto op_registry framework_fbs_header)
add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto op_registry fbs_headers)
target_link_libraries(paddle_full_api_shared framework_proto op_registry)
if(LITE_WITH_X86)
add_dependencies(paddle_full_api_shared xxhash)
......@@ -72,7 +72,7 @@ else()
set(TARGET_COMIPILE_FLAGS "${TARGET_COMIPILE_FLAGS} -flto")
endif()
set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "${TARGET_COMIPILE_FLAGS}")
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h framework_fbs_header)
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h fbs_headers)
if (LITE_WITH_NPU)
# Need to add HIAI runtime libs (libhiai.so) dependency
target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs})
......
......@@ -17,7 +17,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
# Unlike static library, module library has to link target to be able to work
# as a single .so lib.
target_link_libraries(paddle_lite_jni ${lib_DEPS} ${arm_kernels} ${npu_kernels})
add_dependencies(paddle_lite_jni framework_fbs_header)
add_dependencies(paddle_lite_jni fbs_headers)
if (LITE_WITH_NPU)
# Strips the symbols of our protobuf functions to fix the conflicts during
# loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so)
......@@ -32,7 +32,7 @@ else()
endif()
set_target_properties(paddle_lite_jni PROPERTIES COMPILE_FLAGS ${TARGET_COMIPILE_FLAGS})
target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc)
add_dependencies(paddle_lite_jni op_list_h kernel_list_h framework_fbs_header)
add_dependencies(paddle_lite_jni op_list_h kernel_list_h fbs_headers)
if (LITE_WITH_NPU)
# Need to add HIAI runtime libs (libhiai.so) dependency
target_link_libraries(paddle_lite_jni ${npu_builder_libs} ${npu_runtime_libs})
......
......@@ -25,6 +25,8 @@
#include "lite/core/profile/basic_profiler.h"
#endif // LITE_WITH_PROFILE
#include <gflags/gflags.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
using paddle::lite::profile::Timer;
......
......@@ -39,12 +39,16 @@ USE_MIR_PASS(identity_dropout_eliminate_pass);
USE_MIR_PASS(lite_conv_elementwise_fuse_pass);
USE_MIR_PASS(lite_conv_activation_fuse_pass);
USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass);
USE_MIR_PASS(lite_match_matrix_activation_fuse_pass);
USE_MIR_PASS(lite_scales_fuse_pass);
USE_MIR_PASS(lite_sequence_reverse_embedding_fuse_pass);
USE_MIR_PASS(lite_elementwise_activation_fuse_pass);
USE_MIR_PASS(lite_quant_dequant_fuse_pass);
USE_MIR_PASS(type_precision_cast_pass);
USE_MIR_PASS(type_layout_cast_pass);
USE_MIR_PASS(type_layout_cast_preprocess_pass);
USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(lite_reshape_fuse_pass);
USE_MIR_PASS(multi_stream_analysis_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
USE_MIR_PASS(npu_subgraph_pass);
......
......@@ -9,6 +9,7 @@ if(WIN32)
target_link_libraries(lite_pybind ${os_dependency_modules})
else()
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
target_sources(lite_pybind PUBLIC ${__lite_cc_files})
endif(WIN32)
if (LITE_ON_TINY_PUBLISH)
......
......@@ -118,6 +118,7 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
beam_search.cc
reduce_max.cc
sequence_pool.cc
sequence_pool_grad.cc
sequence_expand.cc
slice.cc
reduce_mean.cc
......
......@@ -106,6 +106,42 @@ void conv_depthwise_3x3s1_int8(Dtype* dout,
int padh,
ARMContext* ctx);
void conv_depthwise_3x3s1_int8_int8_impl(int8_t* dout,
const int8_t* din,
const int8_t* weights,
const float* scale,
const float* bias,
bool flag_bias,
int flag_act,
float* alpha,
int num,
int chin,
int hin,
int win,
int hout,
int wout,
int padw,
int padh,
ARMContext* ctx);
void conv_depthwise_3x3s1_int8_float_impl(float* dout,
const int8_t* din,
const int8_t* weights,
const float* scale,
const float* bias,
bool flag_bias,
int flag_act,
float* alpha,
int num,
int chin,
int hin,
int win,
int hout,
int wout,
int padw,
int padh,
ARMContext* ctx);
template <typename Dtype>
void conv_depthwise_3x3s2_int8(Dtype* dout,
const int8_t* din,
......
......@@ -814,24 +814,52 @@ void conv_depthwise_3x3_int8_fp32(const void* din,
alpha[3] = local_alpha;
}
}
bool support_act_type = flag_act <= 1;
bool support_pad_type =
(paddings[0] == paddings[1]) && (paddings[2] == paddings[3]) &&
(paddings[0] == paddings[2]) && (paddings[0] == 0 || paddings[0] == 1);
bool support_stride_type = (param.strides[0] == 1 && param.strides[1] == 1);
bool support_width_type = w_in > 9 ? true : false;
if (stride == 1) {
conv_depthwise_3x3s1_int8(reinterpret_cast<float*>(dout),
reinterpret_cast<const int8_t*>(din),
reinterpret_cast<const int8_t*>(weights),
scale,
bias,
flag_bias,
flag_act,
alpha,
num,
ch_in,
h_in,
w_in,
h_out,
w_out,
pad_w,
pad_h,
ctx);
if (!support_act_type || !support_pad_type || !support_stride_type ||
!support_width_type) {
conv_depthwise_3x3s1_int8(reinterpret_cast<float*>(dout),
reinterpret_cast<const int8_t*>(din),
reinterpret_cast<const int8_t*>(weights),
scale,
bias,
flag_bias,
flag_act,
alpha,
num,
ch_in,
h_in,
w_in,
h_out,
w_out,
pad_w,
pad_h,
ctx);
} else {
conv_depthwise_3x3s1_int8_float_impl(
reinterpret_cast<float*>(dout),
reinterpret_cast<const int8_t*>(din),
reinterpret_cast<const int8_t*>(weights),
scale,
bias,
flag_bias,
flag_act,
alpha,
num,
ch_in,
h_in,
w_in,
h_out,
w_out,
pad_w,
pad_h,
ctx);
}
} else if (stride == 2) {
conv_depthwise_3x3s2_int8(reinterpret_cast<float*>(dout),
reinterpret_cast<const int8_t*>(din),
......@@ -897,24 +925,52 @@ void conv_depthwise_3x3_int8_int8(const void* din,
alpha[3] = local_alpha;
}
}
bool support_act_type = flag_act <= 1;
bool support_pad_type =
(paddings[0] == paddings[1]) && (paddings[2] == paddings[3]) &&
(paddings[0] == paddings[2]) && (paddings[0] == 0 || paddings[0] == 1);
bool support_stride_type = (param.strides[0] == 1 && param.strides[1] == 1);
bool support_width_type = w_in > 9 ? true : false;
if (stride == 1) {
conv_depthwise_3x3s1_int8(reinterpret_cast<int8_t*>(dout),
reinterpret_cast<const int8_t*>(din),
reinterpret_cast<const int8_t*>(weights),
scale,
bias,
flag_bias,
flag_act,
alpha,
num,
ch_in,
h_in,
w_in,
h_out,
w_out,
pad_w,
pad_h,
ctx);
if (!support_act_type || !support_pad_type || !support_stride_type ||
!support_width_type) {
conv_depthwise_3x3s1_int8(reinterpret_cast<int8_t*>(dout),
reinterpret_cast<const int8_t*>(din),
reinterpret_cast<const int8_t*>(weights),
scale,
bias,
flag_bias,
flag_act,
alpha,
num,
ch_in,
h_in,
w_in,
h_out,
w_out,
pad_w,
pad_h,
ctx);
} else {
conv_depthwise_3x3s1_int8_int8_impl(
reinterpret_cast<int8_t*>(dout),
reinterpret_cast<const int8_t*>(din),
reinterpret_cast<const int8_t*>(weights),
scale,
bias,
flag_bias,
flag_act,
alpha,
num,
ch_in,
h_in,
w_in,
h_out,
w_out,
pad_w,
pad_h,
ctx);
}
} else if (stride == 2) {
conv_depthwise_3x3s2_int8(reinterpret_cast<int8_t*>(dout),
reinterpret_cast<const int8_t*>(din),
......
......@@ -300,13 +300,15 @@ void fill_bias_act<float>(float* tensor,
switch (act_param->active_type) {
case lite_api::ActivationType::kRelu:
for (int i = 0; i < remain; i++) {
*dst = *src >= 0.f ? *src : 0.f;
float tmp = (*src + bias_data);
*dst = tmp >= 0.f ? tmp : 0.f;
src++;
dst++;
}
case lite_api::ActivationType::kRelu6:
for (int i = 0; i < remain; i++) {
float tmp = *src >= 0.f ? *src : 0.f;
float tmp = (*src + bias_data);
tmp = tmp >= 0.f ? tmp : 0.f;
*dst = tmp <= act_param->Relu_clipped_coef
? tmp
: act_param->Relu_clipped_coef;
......@@ -315,10 +317,11 @@ void fill_bias_act<float>(float* tensor,
}
case lite_api::ActivationType::kLeakyRelu:
for (int i = 0; i < remain; i++) {
if (*src >= 0.f) {
*dst = *src;
float tmp = (*src + bias_data);
if (tmp >= 0.f) {
*dst = tmp;
} else {
*dst = *src * act_param->Leaky_relu_alpha;
*dst = tmp * act_param->Leaky_relu_alpha;
}
src++;
dst++;
......@@ -336,17 +339,24 @@ void fill_bias_act<float>(float* tensor,
float32x4_t vbias = vdupq_n_f32(bias_data);
float* src = data + j * channel_size;
float* dst = data + j * channel_size;
if (cnt > 0) {
#ifdef __aarch64__
asm volatile(FILL_BIAS FILL_STORE
: [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
: [vbias] "w"(vbias)
: "memory", "cc", "v0", "v1", "v2", "v3");
asm volatile(FILL_BIAS FILL_STORE
:
[din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
: [vbias] "w"(vbias)
: "memory", "cc", "v0", "v1", "v2", "v3");
#else
asm volatile(FILL_BIAS FILL_STORE
: [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
: [vbias] "w"(vbias)
: "memory", "cc", "q3", "q4", "q5", "q6");
asm volatile(FILL_BIAS FILL_STORE
:
[din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
: [vbias] "w"(vbias)
: "memory", "cc", "q3", "q4", "q5", "q6");
#endif
}
for (int i = 0; i < remain; i++) {
  // advance through the leftover elements not covered by the vectorized loop
  *dst = *src + bias_data;
  src++;
  dst++;
}
}
}
}
......
......@@ -56,6 +56,7 @@
#include "lite/backends/arm/math/scale.h"
#include "lite/backends/arm/math/sequence_expand.h"
#include "lite/backends/arm/math/sequence_pool.h"
#include "lite/backends/arm/math/sequence_pool_grad.h"
#include "lite/backends/arm/math/sequence_softmax.h"
#include "lite/backends/arm/math/sgemm.h"
#include "lite/backends/arm/math/sgemv.h"
......
......@@ -2193,7 +2193,13 @@ void pooling3x3s2p1_max(const float* din,
w_unroll_size -= 1;
w_unroll_remian = wout - w_unroll_size * 4;
}
float32x4_t vmin = vdupq_n_f32(std::numeric_limits<float>::lowest());
int w_needed = wout * 2 + 1;
int pad_right_ = w_needed - win - pad_bottom;
int w_2 = pad_right_ > 0 ? w_unroll_remian : w_unroll_remian + 1;
w_2 = w_unroll_size <= 0 ? w_2 - 1 : w_2;
float minval = std::numeric_limits<float>::lowest();
float32x4_t vmin = vdupq_n_f32(minval);
for (int n = 0; n < num; ++n) {
float* data_out_batch = data_out + n * chout * size_channel_out;
......@@ -2232,6 +2238,11 @@ void pooling3x3s2p1_max(const float* din,
break;
}
}
auto pr0 = dr0;
auto pr1 = dr1;
auto pr2 = dr2;
int cnt_num = w_unroll_size;
if (w_unroll_size > 0) {
#ifdef __aarch64__
......@@ -2285,27 +2296,53 @@ void pooling3x3s2p1_max(const float* din,
"q11",
"q15");
#endif
dr0 -= 8;
dr1 -= 8;
dr2 -= 8;
}
// deal with right pad
int wstart = w_unroll_size * 4 * S - P;
for (int j = 0; j < w_unroll_remian; ++j) {
int wend = std::min(wstart + K, win);
int st = wstart > 0 ? wstart : 0;
float tmp = dr0[0];
for (int i = 0; i < wend - st; i++) {
tmp = std::max(tmp, dr0[i]);
tmp = std::max(tmp, dr1[i]);
} else {
float tmp = minval;
for (int i = 0; i < 2; i++) {
tmp = std::max(tmp, std::max(dr0[i], dr1[i]));
tmp = std::max(tmp, dr2[i]);
}
*(dr_out++) = tmp;
dr0 += S - (st - wstart);
dr1 += S - (st - wstart);
dr2 += S - (st - wstart);
wstart += S;
dr_out[0] = tmp;
dr0++;
dr1++;
dr2++;
dr_out++;
}
for (int w = 0; w < w_2 - 1; w += 1) {
float32x4_t vr0 = vld1q_f32(dr0);
float32x4_t vr1 = vld1q_f32(dr1);
float32x4_t vr2 = vld1q_f32(dr2);
vr0 = vsetq_lane_f32(minval, vr0, 3);
vr1 = vsetq_lane_f32(minval, vr1, 3);
vr2 = vsetq_lane_f32(minval, vr2, 3);
float32x4_t vmax1 = vmaxq_f32(vr0, vr1);
vmax1 = vmaxq_f32(vmax1, vr2);
float32x2_t vmax2 =
vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1));
float32x2_t vmax = vpmax_f32(vmax2, vmax2);
dr_out[0] = vget_lane_f32(vmax, 0);
dr_out++;
dr0 += 2;
dr1 += 2;
dr2 += 2;
}
if (pad_right_) {
float tmp = minval;
for (int i = 1; i < 3; i++) {
tmp = std::max(tmp, std::max(pr0[win - i], pr1[win - i]));
tmp = std::max(tmp, pr2[win - i]);
}
dr_out[0] = tmp;
}
data_out_channel += wout;
}
}
......@@ -2539,6 +2576,10 @@ void pooling3x3s2p0_max(const float* din,
int remain = w_unroll_remian - 1;
int right = wout * 2 + 1 - win; // if need right pad
int w_2 = right > 0 ? w_unroll_remian : w_unroll_remian + 1;
w_2 = w_unroll_size <= 0 ? w_2 - 1 : w_2;
float minval = std::numeric_limits<float>::lowest();
for (int n = 0; n < num; ++n) {
float* data_out_batch = data_out + n * chout * size_channel_out;
const float* data_in_batch = data_in + n * chin * size_channel_in;
......@@ -2592,18 +2633,24 @@ void pooling3x3s2p0_max(const float* din,
dr0 -= 8;
dr1 -= 8;
dr2 -= 8;
int rem = win - (w_unroll_size * 4) * S;
int wstart = 0;
for (int j = 0; j < w_unroll_remian; ++j) {
int wend = std::min(wstart + K, rem);
float tmp = dr0[wstart]; // std::numeric_limits<float>::min();
for (int i = wstart; i < wend; i++) {
tmp = std::max(tmp, dr0[i]);
tmp = std::max(tmp, dr1[i]);
tmp = std::max(tmp, dr2[i]);
}
*(dr_out++) = tmp;
wstart += S;
for (int w = 0; w < w_2 - 1; w += 1) {
float32x4_t vr0 = vld1q_f32(dr0);
float32x4_t vr1 = vld1q_f32(dr1);
float32x4_t vr2 = vld1q_f32(dr2);
vr0 = vsetq_lane_f32(minval, vr0, 3);
vr1 = vsetq_lane_f32(minval, vr1, 3);
vr2 = vsetq_lane_f32(minval, vr2, 3);
float32x4_t vmax1 = vmaxq_f32(vr0, vr1);
vmax1 = vmaxq_f32(vmax1, vr2);
float32x2_t vmax2 =
vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1));
float32x2_t vmax = vpmax_f32(vmax2, vmax2);
dr_out[0] = vget_lane_f32(vmax, 0);
dr_out++;
dr0 += 2;
dr1 += 2;
dr2 += 2;
}
#else
asm volatile(
......
......@@ -46,6 +46,68 @@ void reduce_n<float>(const float* src,
}
}
template <>
void reduce_first_of_three<float>(
const float* src, float* dst, int first_in, int second_in, int third_in) {
for (int i = 0; i < second_in; i++) {
for (int j = 0; j < third_in; j++) {
dst[i * third_in + j] = src[i * third_in + j];
for (int k = 1; k < first_in; k++) {
dst[i * third_in + j] =
src[k * second_in * third_in + i * third_in + j] >
dst[i * third_in + j]
? src[k * second_in * third_in + i * third_in + j]
: dst[i * third_in + j];
}
}
}
}
template <>
void reduce_second_of_three<float>(
const float* src, float* dst, int first_in, int second_in, int third_in) {
for (int i = 0; i < first_in; i++) {
for (int j = 0; j < third_in; j++) {
dst[i * third_in + j] = src[i * second_in * third_in + j];
for (int k = 1; k < second_in; k++) {
dst[i * third_in + j] =
src[i * second_in * third_in + third_in * k + j] >
dst[i * third_in + j]
? src[i * second_in * third_in + third_in * k + j]
: dst[i * third_in + j];
}
}
}
}
template <>
void reduce_third_of_three<float>(
const float* src, float* dst, int first_in, int second_in, int third_in) {
for (int i = 0; i < first_in; i++) {
for (int j = 0; j < second_in; j++) {
dst[i * second_in + j] = src[i * second_in * third_in + j * third_in];
for (int k = 1; k < third_in; k++) {
  dst[i * second_in + j] =
      src[i * second_in * third_in + j * third_in + k] >
              dst[i * second_in + j]
          ? src[i * second_in * third_in + j * third_in + k]
          : dst[i * second_in + j];
}
}
}
}
template <>
void reduce_all_of_three<float>(
const float* src, float* dst, int first_in, int second_in, int third_in) {
float max = src[0];
int total_element = first_in * second_in * third_in;
for (int i = 0; i < total_element; i++) {
max = src[i] > max ? src[i] : max;
}
dst[0] = max;
}
template <>
void reduce_c<float>(const float* src,
float* dst,
......
......@@ -35,6 +35,22 @@ void reduce_c(const T* src,
int height_in,
int width_in);
template <typename T>
void reduce_all_of_three(
const T* src, T* dst, int first_in, int second_in, int third_in);
template <typename T>
void reduce_first_of_three(
const T* src, T* dst, int first_in, int second_in, int third_in);
template <typename T>
void reduce_second_of_three(
const T* src, T* dst, int first_in, int second_in, int third_in);
template <typename T>
void reduce_third_of_three(
const T* src, T* dst, int first_in, int second_in, int third_in);
template <typename T>
void reduce_h(const T* src,
T* dst,
......
......@@ -36,70 +36,23 @@ void seq_pool_sum<float>(const float* din,
const float* din_ptr = din + lod[i] * width;
float* dout_ptr = dout + i * width;
int64_t height = static_cast<int64_t>(lod[i + 1] - lod[i]);
if (width == 1) {
float sum = 0.f;
for (int h = 0; h < height; ++h) {
sum += din_ptr[h];
}
*dout_ptr = sum;
} else {
memcpy(dout_ptr, din_ptr, width * sizeof(float));
din_ptr += width;
height = height - 1;
int cnt_w = width >> 2;
int remain_w = width & 3;
int cnt_h = height >> 2;
int remain_h = height & 3;
int stride = width << 2;
for (int w = 0; w < cnt_w; w++) {
const float* din_ptr0 = din_ptr + w * 4;
float32x4_t dout_val = vld1q_f32(dout_ptr);
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
for (int h = 0; h < cnt_h; h++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
float32x4_t din1 = vld1q_f32(din_ptr1);
float32x4_t din2 = vld1q_f32(din_ptr2);
float32x4_t din3 = vld1q_f32(din_ptr3);
dout_val = vaddq_f32(din0, dout_val);
float32x4_t tmp = vaddq_f32(din1, din2);
din_ptr0 += stride;
din_ptr1 += stride;
dout_val = vaddq_f32(din3, dout_val);
din_ptr2 += stride;
din_ptr3 += stride;
dout_val = vaddq_f32(tmp, dout_val);
}
for (int h = 0; h < remain_h; h++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
dout_val = vaddq_f32(din0, dout_val);
din_ptr0 += width;
}
vst1q_f32(dout_ptr, dout_val);
dout_ptr += 4;
}
const float* din_ptr00 = din_ptr + cnt_w * 4;
for (int w = 0; w < remain_w; w++) {
const float* din_ptr0 = din_ptr00 + w;
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
for (int h = 0; h < cnt_h; h++) {
*dout_ptr += din_ptr0[0];
float tmp = din_ptr1[0] + din_ptr2[0];
din_ptr0 += stride;
din_ptr1 += stride;
*dout_ptr += din_ptr3[0];
din_ptr2 += stride;
din_ptr3 += stride;
*dout_ptr += tmp;
if (height > 0) {
if (width == 1) {
float sum = 0.f;
for (int h = 0; h < height; ++h) {
sum += din_ptr[h];
}
for (int h = 0; h < remain_h; h++) {
*dout_ptr += din_ptr0[0];
din_ptr0 += width;
*dout_ptr = sum;
} else {
memcpy(dout_ptr, din_ptr, width * sizeof(float));
din_ptr += width;
height = height - 1;
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; ++w) {
dout_ptr[w] += din_ptr[w];
}
din_ptr += width;
}
dout_ptr++;
}
}
}
......@@ -177,78 +130,35 @@ void seq_pool_sqrt<float>(const float* din,
template <>
void seq_pool_max<float>(const float* din,
float* dout,
int64_t* index,
const std::vector<uint64_t> lod,
int64_t width) {
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
const float* din_ptr = din + lod[i] * width;
float* dout_ptr = dout + i * width;
int64_t* index_ptr = index + i * width;
int64_t height = static_cast<int64_t>(lod[i + 1] - lod[i]);
if (height > 0) {
if (width == 1) {
float max = -std::numeric_limits<float>::max();
int64_t max_index = -1;
for (int h = 0; h < height; ++h) {
max = std::max(max, din_ptr[h]);
max_index = max >= din_ptr[h] ? h : max_index;
}
*dout_ptr = max;
*index_ptr = max_index;
} else {
memcpy(dout_ptr, din_ptr, width * sizeof(float));
memset(index_ptr, 0, width * sizeof(int64_t));
din_ptr += width;
height = height - 1;
int cnt_w = width >> 2;
int remain_w = width & 3;
int cnt_h = height >> 2;
int remain_h = height & 3;
int stride = width << 2;
for (int w = 0; w < cnt_w; w++) {
const float* din_ptr0 = din_ptr + w * 4;
float32x4_t dout_val = vld1q_f32(dout_ptr);
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
for (int h = 0; h < cnt_h; h++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
float32x4_t din1 = vld1q_f32(din_ptr1);
float32x4_t din2 = vld1q_f32(din_ptr2);
float32x4_t din3 = vld1q_f32(din_ptr3);
dout_val = vmaxq_f32(din0, dout_val);
float32x4_t tmp = vmaxq_f32(din1, din2);
din_ptr0 += stride;
din_ptr1 += stride;
dout_val = vmaxq_f32(din3, dout_val);
din_ptr2 += stride;
din_ptr3 += stride;
dout_val = vmaxq_f32(tmp, dout_val);
}
for (int h = 0; h < remain_h; h++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
dout_val = vmaxq_f32(din0, dout_val);
din_ptr0 += width;
}
vst1q_f32(dout_ptr, dout_val);
dout_ptr += 4;
}
const float* din_ptr00 = din_ptr + cnt_w * 4;
for (int w = 0; w < remain_w; w++) {
const float* din_ptr0 = din_ptr00 + w;
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
for (int h = 0; h < cnt_h; h++) {
*dout_ptr += din_ptr0[0];
*dout_ptr = std::max(*dout_ptr, din_ptr0[0]);
float tmp = std::max(din_ptr1[0], din_ptr2[0]);
din_ptr0 += stride;
din_ptr1 += stride;
*dout_ptr = std::max(*dout_ptr, din_ptr3[0]);
din_ptr2 += stride;
din_ptr3 += stride;
*dout_ptr = std::max(*dout_ptr, tmp);
}
for (int h = 0; h < remain_h; h++) {
*dout_ptr = std::max(*dout_ptr, din_ptr0[0]);
din_ptr0 += width;
int remain_h = height - 1;
for (int h = 0; h < remain_h; h++) {
for (int w = 0; w < width; w++) {
dout_ptr[w] = std::max(dout_ptr[w], din_ptr[w]);
// row 0 was consumed by the memcpy above, so din_ptr holds row h + 1
index_ptr[w] = dout_ptr[w] > din_ptr[w] ? index_ptr[w] : h + 1;
}
dout_ptr++;
din_ptr += width;
}
}
}
......@@ -258,26 +168,33 @@ void seq_pool_max<float>(const float* din,
template <>
void seq_pool_min<float>(const float* din,
float* dout,
int64_t* index,
const std::vector<uint64_t> lod,
int64_t width) {
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
const float* din_ptr = din + lod[i] * width;
float* dout_ptr = dout + i * width;
int64_t* index_ptr = index + i * width;
int64_t height = static_cast<int64_t>(lod[i + 1] - lod[i]);
if (height > 0) {
if (width == 1) {
float min = std::numeric_limits<float>::max();
int64_t min_index = -1;
for (int h = 0; h < height; ++h) {
min = std::min(min, din_ptr[h]);
min_index = min >= din_ptr[h] ? h : min_index;
}
*dout_ptr = min;
*index_ptr = min_index;
} else {
memcpy(dout_ptr, din_ptr, width * sizeof(float));
memset(index_ptr, 0, width * sizeof(int64_t));
din_ptr += width;
int remain_h = height - 1;
for (int h = 0; h < remain_h; h++) {
for (int w = 0; w < width; w++) {
dout_ptr[w] = std::min(dout_ptr[w], din_ptr[w]);
// row 0 was consumed by the memcpy above, so din_ptr holds row h + 1
index_ptr[w] = dout_ptr[w] < din_ptr[w] ? index_ptr[w] : h + 1;
}
din_ptr += width;
}
......
......@@ -42,12 +42,14 @@ void seq_pool_sqrt(const T* din,
template <typename T>
void seq_pool_max(const T* din,
T* dout,
int64_t* index,
const std::vector<uint64_t> lod,
int64_t width);
template <typename T>
void seq_pool_min(const T* din,
T* dout,
int64_t* index,
const std::vector<uint64_t> lod,
int64_t width);
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/arm/math/sequence_pool_grad.h"
#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
template <>
void seq_pool_sum_grad<float>(const float* din,
const float* dout_grad,
float* din_grad,
const std::vector<uint64_t> lod,
int64_t width) {
for (int i = 0; i < static_cast<int>(lod.size()) - 1; i++) {
int64_t height = static_cast<int64_t>(lod[i + 1] - lod[i]);
const float* dout_grad_ptr = dout_grad + i * width;
float* din_grad_ptr = din_grad + lod[i] * width;
if (height > 0) {
if (width == 1) {
for (int h = 0; h < height; ++h) {
din_grad_ptr[h] = dout_grad_ptr[h];
}
} else {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
din_grad_ptr[w] = dout_grad_ptr[w];
}
din_grad_ptr += width;
}
}
}
}
}
template <>
void seq_pool_average_grad<float>(const float* din,
const float* dout_grad,
float* din_grad,
const std::vector<uint64_t> lod,
int64_t width) {
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
int64_t height = static_cast<int64_t>(lod[i + 1] - lod[i]);
const float* dout_grad_ptr = dout_grad + i * width;
float* din_grad_ptr = din_grad + lod[i] * width;
float alpha = 1.0 / height;
if (height > 0) {
if (width == 1) {
float sum = 0.f;
for (int h = 0; h < height; ++h) {
din_grad_ptr[h] = alpha * dout_grad_ptr[h];
}
} else {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
din_grad_ptr[w] = alpha * dout_grad_ptr[w];
}
din_grad_ptr += width;
}
}
}
}
}
template <>
void seq_pool_sqrt_grad<float>(const float* din,
const float* dout_grad,
float* din_grad,
const std::vector<uint64_t> lod,
int64_t width) {
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
int64_t height = static_cast<int64_t>(lod[i + 1] - lod[i]);
const float* dout_grad_ptr = dout_grad + i * width;
float* din_grad_ptr = din_grad + lod[i] * width;
float alpha = 1.0 / sqrtf(height);
if (height > 0) {
if (width == 1) {
float sum = 0.f;
for (int h = 0; h < height; ++h) {
din_grad_ptr[h] = alpha * dout_grad_ptr[h];
}
} else {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
din_grad_ptr[w] = alpha * dout_grad_ptr[w];
}
din_grad_ptr += width;
}
}
}
}
}
template <>
void seq_pool_max_grad<float>(const float* din,
const float* dout_grad,
const int64_t* index_grad,
float* din_grad,
const std::vector<uint64_t> lod,
int64_t width) {
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
int64_t height = lod[i + 1] - lod[i];
const float* dout_grad_ptr = dout_grad + i * width;
const int64_t* index_grad_ptr = index_grad + i * width;
float* din_grad_ptr = din_grad + lod[i] * width;
if (height > 0) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
if (h == index_grad_ptr[w]) {
din_grad_ptr[w] = dout_grad_ptr[w];
} else {
din_grad_ptr[w] = 0.f;
}
}
din_grad_ptr += width;
}
}
}
}
template <>
void seq_pool_first_grad<float>(const float* din,
const float* dout_grad,
float* din_grad,
const std::vector<uint64_t> lod,
int64_t width) {
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
int64_t height = lod[i + 1] - lod[i];
const float* dout_grad_ptr = dout_grad + i * width;
float* din_grad_ptr = din_grad + lod[i] * width;
if (height > 0) {
for (int w = 0; w < width; w++) {
din_grad_ptr[w] = dout_grad_ptr[w];
}
din_grad_ptr += width;
for (int h = 1; h < height; h++) {
for (int w = 0; w < width; w++) {
din_grad_ptr[w] = 0.f;
}
din_grad_ptr += width;
}
}
}
}
template <>
void seq_pool_last_grad<float>(const float* din,
const float* dout_grad,
float* din_grad,
const std::vector<uint64_t> lod,
int64_t width) {
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
int64_t height = lod[i + 1] - lod[i];
const float* dout_grad_ptr = dout_grad + i * width;
float* din_grad_ptr = din_grad + lod[i] * width;
if (height > 0) {
for (int h = 0; h < height - 1; h++) {
for (int w = 0; w < width; w++) {
din_grad_ptr[w] = 0.f;
}
din_grad_ptr += width;
}
// last
for (int w = 0; w < width; w++) {
din_grad_ptr[w] = dout_grad_ptr[w];
}
}
}
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
template <typename T>
void seq_pool_sum_grad(const T* din,
const T* dout_grad,
T* din_grad,
const std::vector<uint64_t> lod,
int64_t width);
template <typename T>
void seq_pool_average_grad(const T* din,
const T* dout_grad,
T* din_grad,
const std::vector<uint64_t> lod,
int64_t width);
template <typename T>
void seq_pool_sqrt_grad(const T* din,
const T* dout_grad,
T* din_grad,
const std::vector<uint64_t> lod,
int64_t width);
template <typename T>
void seq_pool_max_grad(const T* din,
const T* dout_grad,
const int64_t* index_grad,
T* din_grad,
const std::vector<uint64_t> lod,
int64_t width);
template <typename T>
void seq_pool_first_grad(const T* din,
const T* dout_grad,
T* din_grad,
const std::vector<uint64_t> lod,
int64_t width);
template <typename T>
void seq_pool_last_grad(const T* din,
const T* dout_grad,
T* din_grad,
const std::vector<uint64_t> lod,
int64_t width);
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
......@@ -79,6 +79,13 @@ void slice(const Dtype* input,
}
}
template void slice(const float* input,
std::vector<int64_t> dims,
std::vector<int> axes,
std::vector<int> starts,
std::vector<int> ends,
float* out,
Context<TARGET(kARM)>* ctx);
template void slice(const int* input,
std::vector<int64_t> dims,
std::vector<int> axes,
......@@ -86,12 +93,12 @@ template void slice(const int* input,
std::vector<int> ends,
int* out,
Context<TARGET(kARM)>* ctx);
template void slice(const float* input,
template void slice(const int64_t* input,
std::vector<int64_t> dims,
std::vector<int> axes,
std::vector<int> starts,
std::vector<int> ends,
float* out,
int64_t* out,
Context<TARGET(kARM)>* ctx);
} // namespace math
......
......@@ -51,11 +51,11 @@ void split_cpy<float>(const float* din, float* dout, int num) {
}
}
template <>
void split<float>(const float* din,
const std::vector<lite::Tensor*>& dout,
const int axis,
const std::vector<int>& in_strides) {
template <typename T>
void split(const T* din,
const std::vector<lite::Tensor*>& dout,
const int axis,
const std::vector<int>& in_strides) {
int input_offset = 0;
for (auto out : dout) {
auto out_dim = out->dims();
......@@ -65,15 +65,15 @@ void split<float>(const float* din,
out_strides[i] = out_strides[i + 1] * out_dim[i];
}
float* out_data = out->mutable_data<float>();
T* out_data = out->mutable_data<T>();
int before = out_strides[0] / out_strides[axis];
int in_after = in_strides[axis];
int out_after = out_strides[axis];
const float* din_ptr = din + input_offset;
const T* din_ptr = din + input_offset;
for (int i = 0; i < before; ++i) {
std::memcpy(out_data, din_ptr, sizeof(float) * out_after);
std::memcpy(out_data, din_ptr, sizeof(T) * out_after);
din_ptr += in_after;
out_data += out_after;
}
......@@ -81,6 +81,15 @@ void split<float>(const float* din,
}
}
template void split(const float* din,
const std::vector<lite::Tensor*>& dout,
const int axis,
const std::vector<int>& in_strides);
template void split(const int64_t* din,
const std::vector<lite::Tensor*>& dout,
const int axis,
const std::vector<int>& in_strides);
} // namespace math
} // namespace arm
} // namespace lite
......
......@@ -2,4 +2,5 @@ if (NOT LITE_WITH_BM)
return()
endif()
lite_cc_library(target_wrapper_bm SRCS target_wrapper.cc DEPS ${bm_runtime_libs})
add_library(target_wrapper_bm STATIC target_wrapper.cc)
target_link_libraries(target_wrapper_bm -Wl,-rpath,${BM_SDK_CPLIB_RPATH}:${BM_SDK_LIB_RPATH} -L${BM_SDK_CPLIB_RPATH} -L${BM_SDK_LIB_RPATH} -lbmcompiler -lbmcpu -lbmlib -lbmrt)
......@@ -20,6 +20,7 @@ nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_strided_gemm SRCS strided_gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_sequence_padding SRCS sequence_padding.cu DEPS ${cuda_static_deps})
nv_library(cuda_bias SRCS bias.cu DEPS ${cuda_static_deps})
nv_library(cuda_sequence_helper SRCS sequence_helper.cu DEPS ${cuda_static_deps})
set (
math_cuda
......@@ -39,6 +40,7 @@ set (
cuda_sequence_padding
cuda_bias
cudnn_helper
cuda_sequence_helper
)
set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda")
......@@ -55,31 +55,32 @@ bool CudnnConv2D<T, Ptype_out>::create(const operators::ConvParam& param,
CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->input_desc_,
CUDNN_TENSOR_NCHW,
GetCudnnDataType<Ptype_out>(),
cudnn::cudnnTypeWrapper<T>::type,
batch,
ic,
ih,
iw));
CUDNN_CHECK(cudnnSetFilter4dDescriptor(this->filter_desc_,
GetCudnnDataType<Ptype_out>(),
cudnn::cudnnTypeWrapper<T>::type,
CUDNN_TENSOR_NCHW,
oc,
ic / param.groups,
kh,
kw));
CUDNN_CHECK(cudnnSetConvolution2dDescriptor(this->conv_desc_,
ph,
pw,
sh,
sw,
dh,
dw,
CUDNN_CROSS_CORRELATION,
GetCudnnDataType<Ptype_out>()));
CUDNN_CHECK(
cudnnSetConvolution2dDescriptor(this->conv_desc_,
ph,
pw,
sh,
sw,
dh,
dw,
CUDNN_CROSS_CORRELATION,
cudnn::cudnnTypeWrapper<T>::type));
CUDNN_CHECK(cudnnSetConvolutionGroupCount(this->conv_desc_, param.groups));
CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->output_desc_,
CUDNN_TENSOR_NCHW,
GetCudnnDataType<Ptype_out>(),
cudnn::cudnnTypeWrapper<T>::type,
batch,
oc,
oh,
......@@ -179,7 +180,7 @@ bool CudnnConv2D<T, Ptype_out>::create(const operators::ConvParam& param,
int dim_bias[] = {1, oc, 1, 1};
int stride_bias[] = {oc, 1, 1, 1};
cudnnSetTensorNdDescriptor(this->bias_desc_,
GetCudnnDataType<Ptype_out>(),
cudnn::cudnnTypeWrapper<T>::type,
4,
dim_bias,
stride_bias);
......
......@@ -21,17 +21,7 @@ namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <>
cudnnDataType_t GetCudnnDataType<PRECISION(kFloat)>() {
return CUDNN_DATA_FLOAT;
}
template <>
cudnnDataType_t GetCudnnDataType<PRECISION(kFP16)>() {
return CUDNN_DATA_HALF;
}
namespace cudnn {} // namespace cudnn
} // namespace math
} // namespace cuda
} // namespace lite
......
......@@ -25,10 +25,97 @@ namespace paddle {
namespace lite {
namespace cuda {
namespace math {
namespace cudnn {
template <lite_api::PrecisionType PType>
cudnnDataType_t GetCudnnDataType();
template <typename T>
class cudnnTypeWrapper;
template <>
class cudnnTypeWrapper<float> {
public:
static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
typedef const float ScalingParamType;
static ScalingParamType* kOne() {
static ScalingParamType v = 1.0f;
return &v;
}
static ScalingParamType* kZero() {
static ScalingParamType v = 0.0f;
return &v;
}
};
template <>
class cudnnTypeWrapper<half> {
public:
static const cudnnDataType_t type = CUDNN_DATA_HALF;
typedef const half ScalingParamType;
static ScalingParamType* kOne() {
static ScalingParamType v = __float2half(1.0f);
return &v;
}
static ScalingParamType* kZero() {
static ScalingParamType v = __float2half(0.0f);
return &v;
}
};
struct ParamsRegion {
ParamsRegion() : offset_(nullptr), size_(0) {}
ParamsRegion(void* offset, size_t size) : offset_(offset), size_(size) {}
~ParamsRegion() {}
ParamsRegion& operator=(const ParamsRegion& right) {
offset_ = right.offset_;
size_ = right.size_;
return *this;
}
bool operator==(const ParamsRegion& right) {
bool comp_eq = true;
comp_eq = comp_eq && (offset_ == right.offset_);
comp_eq = comp_eq && (size_ == right.size_);
return comp_eq;
}
void* offset_;
size_t size_;
};
template <typename T>
class TensorDescriptors {
public:
TensorDescriptors(size_t n,
const std::vector<std::vector<int>>& dim,
const std::vector<std::vector<int>>& stride) {
descs_.resize(n);
CHECK_EQ(dim.size(), stride.size())
<< "dim size should be equal to stride size";
for (size_t i = 0; i < n; ++i) {
CUDNN_CHECK(cudnnCreateTensorDescriptor(&descs_[i]));
CUDNN_CHECK(cudnnSetTensorNdDescriptor(descs_[i],
cudnnTypeWrapper<T>::type,
dim[i].size(),
dim[i].data(),
stride[i].data()));
}
}
~TensorDescriptors() {
for (auto desc : descs_) {
CUDNN_CHECK(cudnnDestroyTensorDescriptor(desc));
}
}
const cudnnTensorDescriptor_t* descs() const { return descs_.data(); }
int size() const { return descs_.size(); }
private:
std::vector<cudnnTensorDescriptor_t> descs_;
};
} // namespace cudnn
} // namespace math
} // namespace cuda
} // namespace lite
......
......@@ -54,7 +54,7 @@ bool CudnnSoftmax<T, Ptype>::Create(const operators::SoftmaxParam& param,
const int stride_c = H * stride_h;
const int stride_n = C * stride_c;
CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(bottom_desc_,
GetCudnnDataType<Ptype>(),
cudnn::cudnnTypeWrapper<T>::type,
N,
C,
H,
......@@ -64,7 +64,7 @@ bool CudnnSoftmax<T, Ptype>::Create(const operators::SoftmaxParam& param,
stride_h,
stride_w));
CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(top_desc_,
GetCudnnDataType<Ptype>(),
cudnn::cudnnTypeWrapper<T>::type,
N,
C,
H,
......
......@@ -30,17 +30,12 @@ __global__ void CopyMatrixRowsKernel(const T* src,
int height,
int width,
bool is_src_index) {
int idx = threadIdx.x;
int idy = threadIdx.y;
int row_id = blockDim.y * blockIdx.x + idy;
if (row_id < height) {
int src_idx = is_src_index ? index[row_id] : row_id;
int dst_idx = is_src_index ? row_id : index[row_id];
const T* src_data = src + src_idx * width;
T* dst_data = dst + dst_idx * width;
for (int i = idx; i < width; i += blockDim.x) {
dst_data[i] = src_data[i];
}
CUDA_KERNEL_LOOP(tid, height * width) {
int row = tid / width;
int idx = tid % width;
int src_row = is_src_index ? index[row] : row;
int dst_row = is_src_index ? row : index[row];
dst[dst_row * width + idx] = src[src_row * width + idx];
}
}
......@@ -69,9 +64,8 @@ void CopyMatrixRowsFunctor<T>::operator()(
sizeof(uint64_t) * index_lod.size(),
IoDirection::HtoD,
stream);
dim3 threads(128, 8);
dim3 grids((height + threads.y - 1) / threads.y);
CopyMatrixRowsKernel<T><<<grids, threads, 0, stream>>>(
CopyMatrixRowsKernel<
T><<<CUDA_GET_BLOCKS(height * width), CUDA_NUM_THREADS, 0, stream>>>(
src_data, dst_data, index_tensor_data, height, width, is_src_index);
CUDA_POST_KERNEL_CHECK;
}
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/backends/cuda/math/sequence_helper.h"
#include "lite/backends/cuda/math/utils.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename Dtype>
__global__ void Map2Out(
Dtype* output, const Dtype* input, const int* map, int count, int lastdim) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < count) {
int seq = tid / lastdim;
output[map[seq] * lastdim + tid % lastdim] = input[tid];
}
}
template <typename Dtype>
__global__ void Map2In(
Dtype* output, const Dtype* input, const int* map, int count, int lastdim) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < count) {
int seq = tid / lastdim;
output[tid] = input[map[seq] * lastdim + tid % lastdim];
}
}
template <typename Dtype>
void Map2OutFunc(const Dtype* input,
Dtype* output,
int word_size,
int seq_sum,
cudaStream_t stream,
int* dev_map_vec) {
int count = seq_sum * word_size;
int block_dim = count;
int grid_dim = 1;
if (count > 1024) {
block_dim = 256;
grid_dim = (count + block_dim - 1) / block_dim;
}
Map2Out<<<grid_dim, block_dim, 0, stream>>>(
output, input, dev_map_vec, count, word_size);
}
template <typename Dtype>
void Map2InFunc(const Dtype* input,
Dtype* output,
int hidden_size,
int seq_sum,
cudaStream_t stream,
int* dev_map_vec) {
int count = seq_sum * hidden_size;
int block_dim = count;
int grid_dim = 1;
if (count > 1024) {
block_dim = 256;
grid_dim = (count + block_dim - 1) / block_dim;
}
Map2In<<<grid_dim, block_dim, 0, stream>>>(
output, input, dev_map_vec, count, hidden_size);
}
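As a quick check on the launch configuration above (illustrative numbers): with seq_sum = 10 and word_size = 200, count = 2000 exceeds 1024, so block_dim = 256 and grid_dim = (2000 + 255) / 256 = 8; the 8 x 256 = 2048 launched threads cover all 2000 elements, and the trailing 48 threads are masked out by the tid < count guard inside Map2Out / Map2In.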
template <typename Dtype>
void SeqSortedseqTranseUtil::Seq2SortedSeq(const Dtype* input,
Dtype* output,
int word_size,
cudaStream_t stream) {
int seq_sum = map_vec_.size();
Map2OutFunc(input, output, word_size, seq_sum, stream, dev_map_vec_);
}
template <typename Dtype>
void SeqSortedseqTranseUtil::SortedSeq2Seq(const Dtype* input,
Dtype* output,
int hidden_size,
cudaStream_t stream) {
int seq_sum = map_vec_.size();
Map2InFunc(input, output, hidden_size, seq_sum, stream, dev_map_vec_);
}
bool SeqSortedseqTranseUtil::GetSortedMap(const std::vector<int>& offset_vec,
cudaStream_t stream_id) {
int batch_size = offset_vec.size() - 1;
int word_sum = offset_vec[offset_vec.size() - 1];
std::vector<int> length_vec(batch_size);
length_index_.resize(batch_size);
int emit_length = 0;
if (batch_size == 1) {
emit_length = offset_vec[1] - offset_vec[0];
emit_offset_vec_.resize(emit_length + 1);
for (int i = 0; i <= emit_length; ++i) {
emit_offset_vec_[i] = i;
}
return false;
}
int max_len = 0;
for (int i = 0; i < offset_vec.size() - 1; ++i) {
int len = offset_vec[i + 1] - offset_vec[i];
max_len = max_len > len ? max_len : len;
length_vec[i] = len;
length_index_[i] = i;
}
emit_length = max_len;
if (max_len == 1) {
emit_offset_vec_.resize(2);
emit_offset_vec_[0] = 0;
emit_offset_vec_[1] = emit_length * batch_size;
return false;
}
std::stable_sort(length_index_.begin(),
length_index_.end(),
[&length_vec](int i1, int i2) {
return length_vec[i1] > length_vec[i2];
});
emit_offset_vec_.resize(max_len + 1);
map_vec_.resize(word_sum);
if (word_sum > dev_map_vec_length_) {
if (dev_map_vec_ != nullptr) {
TargetWrapperCuda::Free(static_cast<void*>(dev_map_vec_));
}
dev_map_vec_ =
static_cast<int*>(TargetWrapperCuda::Malloc(sizeof(int) * word_sum));
dev_map_vec_length_ = word_sum;
}
int target_word_id = 0;
std::vector<int> length_vec_cnt = length_vec;
int last_batch_size = batch_size;
for (int word_id_in_seq = 0; word_id_in_seq < max_len; word_id_in_seq++) {
emit_offset_vec_[word_id_in_seq] = target_word_id;
for (int batch_id = 0; batch_id < last_batch_size; batch_id++) {
int old_batch_id = length_index_[batch_id];
if (length_vec_cnt[old_batch_id] > 0) {
int inner_word_id_in_seq = word_id_in_seq;
if (is_reverse_) {
inner_word_id_in_seq = length_vec[old_batch_id] - 1 - word_id_in_seq;
}
int old_word_id = offset_vec[old_batch_id] + inner_word_id_in_seq;
map_vec_[old_word_id] = target_word_id;
length_vec_cnt[old_batch_id]--;
target_word_id++;
} else {
last_batch_size--;
break;
}
}
}
TargetWrapperCuda::MemcpyAsync(dev_map_vec_,
map_vec_.data(),
sizeof(int) * word_sum,
IoDirection::HtoD,
stream_id);
emit_offset_vec_[max_len] = word_sum;
emit_length_ = emit_length;
return true;
}
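A small worked example of the mapping built above (illustrative only): take offset_vec = {0, 2, 5}, i.e. batch 0 holds words 0-1 and batch 1 holds words 2-4, with is_reverse_ = false. The batches are visited longest-first (length_index_ = {1, 0}) and word slots are assigned time step by time step, giving map_vec_ = {1, 3, 0, 2, 4} (original word position -> position in the length-sorted, time-major layout) and emit_offset_vec_ = {0, 2, 4, 5}, i.e. time steps 0, 1 and 2 emit 2, 2 and 1 words respectively.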
template void SeqSortedseqTranseUtil::Seq2SortedSeq(const float* input,
float* output,
int word_size,
cudaStream_t stream);
template void SeqSortedseqTranseUtil::SortedSeq2Seq(const float* input,
float* output,
int hidden_size,
cudaStream_t stream);
template void SeqSortedseqTranseUtil::Seq2SortedSeq(const half* input,
half* output,
int word_size,
cudaStream_t stream);
template void SeqSortedseqTranseUtil::SortedSeq2Seq(const half* input,
half* output,
int hidden_size,
cudaStream_t stream);
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
#include "lite/backends/cuda/target_wrapper.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
class SeqSortedseqTranseUtil {
public:
explicit SeqSortedseqTranseUtil(bool is_reverse = false, bool is_bi = false)
: is_reverse_(is_reverse),
is_bi_(is_bi),
dev_map_vec_(nullptr),
dev_map_vec_length_(0) {}
~SeqSortedseqTranseUtil() {
if (dev_map_vec_ != nullptr) {
TargetWrapperCuda::Free(static_cast<void*>(dev_map_vec_));
}
}
std::vector<int>& GetLengthIndex() { return length_index_; }
std::vector<int>& GetEmitOffsetVec() { return emit_offset_vec_; }
std::vector<int>& GetMapVec() { return map_vec_; }
int* GetDevMapVec() { return dev_map_vec_; }
int GetEmitLength() { return emit_length_; }
template <typename Dtype>
void Seq2SortedSeq(const Dtype* input,
Dtype* output,
int word_size,
cudaStream_t stream);
template <typename Dtype>
void SortedSeq2Seq(const Dtype* input,
Dtype* output,
int hidden_size,
cudaStream_t stream);
bool GetSortedMap(const std::vector<int>& offset_vec, cudaStream_t stream_id);
private:
std::vector<int> length_index_;
std::vector<int> emit_offset_vec_;
std::vector<int> map_vec_;
int emit_length_;
bool is_reverse_;
bool is_bi_;
int* dev_map_vec_;
int dev_map_vec_length_;
};
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
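A hedged usage sketch of the helper declared above, written inside the paddle::lite::cuda::math namespace (the buffer names, sizes and stream are illustrative; the real callers are the CUDA sequence/RNN kernels):
  // Assume offsets is the LoD offset vector of the batch, x_dev / tmp_dev /
  // y_dev are device buffers of shape [word_sum, hidden_size], and stream is
  // a valid cudaStream_t.
  SeqSortedseqTranseUtil util(/*is_reverse=*/false);
  if (util.GetSortedMap(offsets, stream)) {
    // Reorder into the length-sorted, time-major layout, run the per-time-step
    // kernels on tmp_dev, then scatter the results back to the original order.
    util.Seq2SortedSeq(x_dev, tmp_dev, hidden_size, stream);
    // ... per-time-step computation on tmp_dev ...
    util.SortedSeq2Seq(tmp_dev, y_dev, hidden_size, stream);
  }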
......@@ -76,6 +76,7 @@ bool Device::Build(std::vector<ge::Operator>& input_nodes, // NOLINT
}
}
VLOG(3) << "Getting input node size " << input_nodes.size();
VLOG(3) << "Getting output node size " << output_nodes.size();
ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes);
// Build IR model
......
......@@ -96,7 +96,9 @@ bool AclModelClient::GetModelIOTensorDim(
ACL_CALL(aclmdlGetInputDims(model_desc_, i, &input_dim));
aclDataType data_type = aclmdlGetInputDataType(model_desc_, i);
aclFormat data_format = aclmdlGetInputFormat(model_desc_, i);
TensorDesc tensor_desc = TensorDesc(data_type, input_dim, data_format);
const std::string name_str(aclmdlGetInputNameByIndex(model_desc_, i));
TensorDesc tensor_desc =
TensorDesc(name_str, data_type, input_dim, data_format);
input_tensor->push_back(tensor_desc);
}
......@@ -108,7 +110,9 @@ bool AclModelClient::GetModelIOTensorDim(
ACL_CALL(aclmdlGetOutputDims(model_desc_, i, &output_dim));
aclDataType data_type = aclmdlGetOutputDataType(model_desc_, i);
aclFormat data_format = aclmdlGetOutputFormat(model_desc_, i);
TensorDesc tensor_desc = TensorDesc(data_type, output_dim, data_format);
const std::string name_str(aclmdlGetOutputNameByIndex(model_desc_, i));
TensorDesc tensor_desc =
TensorDesc(name_str, data_type, output_dim, data_format);
output_tensor->push_back(tensor_desc);
}
return true;
......@@ -118,12 +122,10 @@ bool AclModelClient::GetTensorFromDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor) {
size_t device_output_num = aclmdlGetDatasetNumBuffers(output_dataset_);
size_t tensor_output_num = reinterpret_cast<size_t>(output_tensor->size());
if (device_output_num != tensor_output_num) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] output number not equal, device number is "
<< device_output_num << "tensor number is " << tensor_output_num;
return false;
}
CHECK_EQ(device_output_num, tensor_output_num)
<< "[HUAWEI_ASCEND_NPU] tensor output number should equal to device "
"output number, device output number is "
<< device_output_num << ", tensor output number is " << tensor_output_num;
for (size_t i = 0; i < device_output_num; i++) {
aclDataBuffer* buffer_device = aclmdlGetDatasetBuffer(output_dataset_, i);
void* device_data = aclGetDataBufferAddr(buffer_device);
......@@ -195,7 +197,10 @@ void AclModelClient::CreateOutputDataset(
return;
}
size_t output_size = aclmdlGetNumOutputs(model_desc_);
CHECK_EQ(output_size, output_tensor->size());
CHECK_EQ(output_size, output_tensor->size())
<< "[HUAWEI_ASCEND_NPU] model output number should equal to output "
"tensor size, model output number is "
<< output_size << ", output tensor number is " << output_tensor->size();
for (size_t i = 0; i < output_size; i++) {
size_t buffer_size = aclmdlGetOutputSizeByIndex(model_desc_, i);
void* buffer_device = nullptr;
......@@ -244,6 +249,10 @@ bool AclModelClient::ModelExecute(
VLOG(3) << "[HUAWEI_ASCEND_NPU] GetTensorFromDataset succeed, modelId:"
<< model_id_;
// destroy dataset
DestroyDataset(&input_dataset_);
DestroyDataset(&output_dataset_);
return true;
}
......@@ -270,16 +279,13 @@ void AclModelClient::DestroyDataset(aclmdlDataset** dataset) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Destroy dataset success.";
}
bool AclModelClient::UnloadModel() {
void AclModelClient::UnloadModel() {
if (!load_flag_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] no need to unload model, load flag is "
<< load_flag_;
return true;
return;
}
DestroyDataset(&input_dataset_);
DestroyDataset(&output_dataset_);
ACL_CALL(aclmdlUnload(model_id_));
if (model_desc_ != nullptr) {
ACL_CALL(aclmdlDestroyDesc(model_desc_));
......@@ -299,7 +305,6 @@ bool AclModelClient::UnloadModel() {
}
load_flag_ = false;
VLOG(3) << "[HUAWEI_ASCEND_NPU] Unload model success, model id " << model_id_;
return true;
}
uint32_t AclModelClient::num_devices() {
......
......@@ -25,15 +25,20 @@ namespace huawei_ascend_npu {
class TensorDesc {
public:
TensorDesc(aclDataType data_type, aclmdlIODims dims, aclFormat format) {
TensorDesc(const std::string name,
aclDataType data_type,
aclmdlIODims dims,
aclFormat format) {
if (format == ACL_FORMAT_NHWC) {
dim_order[1] = 3;
dim_order[2] = 1;
dim_order[3] = 2;
}
// create ge::TensorDesc
VLOG(3) << "[HUAWEI_ASCEND_NPU] Getting tensor name : " << name;
ge_tensor_desc_ = new ge::TensorDesc(
GetGeShape(dims), GetGeFormat(format), GetGeDataType(data_type));
ge_tensor_desc_->SetName(name);
CHECK(ge_tensor_desc_ != nullptr);
VLOG(3) << "[HUAWEI_ASCEND_NPU] Getting data shape : " << repr();
}
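For reference on the dim_order remapping above: it presumably starts from the identity order {0, 1, 2, 3}, so for an ACL_FORMAT_NHWC input the dims are read back in N, C, H, W order, e.g. an NHWC shape {1, 224, 224, 3} would be reported to GE as {1, 3, 224, 224} (this assumes GetGeShape indexes the raw dims through dim_order, which is how the surrounding code appears to use it).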
......@@ -145,6 +150,9 @@ class AclModelClient {
}
~AclModelClient() {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Unloading model, model id is: "
<< model_id_;
UnloadModel();
VLOG(3) << "[HUAWEI_ASCEND_NPU] Destroying Huawei Ascend Device: "
<< device_id_;
ACL_CALL(aclrtResetDevice(device_id_));
......@@ -156,7 +164,6 @@ class AclModelClient {
std::vector<TensorDesc>* output_tensor);
bool ModelExecute(std::vector<std::shared_ptr<ge::Tensor>>* input_tensor,
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor);
bool UnloadModel();
private:
void CreateInputDataset(
......@@ -166,6 +173,7 @@ class AclModelClient {
bool GetTensorFromDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor);
void DestroyDataset(aclmdlDataset** dataset);
void UnloadModel();
private:
uint32_t num_devices();
......
......@@ -133,7 +133,7 @@ lite_cc_library(type_system SRCS type_system.cc DEPS tensor target_wrapper)
lite_cc_library(program SRCS program.cc
DEPS op kernel model_parser ${ops} ${cpp_wrapper}
PROFILE_DEPS lite_profiler
CUDA_DEPS nvtx_wrapper)
CUDA_DEPS nvtx_wrapper cuda_type_trans)
if (NOT LITE_ON_TINY_PUBLISH)
lite_cc_library(optimizer SRCS optimizer.cc DEPS mir_pass_manager model_parser program)
......
......@@ -23,12 +23,16 @@ lite_cc_library(mir_passes
fusion/quant_dequant_fuse_pass.cc
fusion/sequence_pool_concat_fuse_pass.cc
fusion/scale_activation_fuse_pass.cc
fusion/reshape_fuse_pass.cc
fusion/__xpu__resnet_fuse_pass.cc
fusion/__xpu__resnet_cbam_fuse_pass.cc
fusion/__xpu__multi_encoder_fuse_pass.cc
fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc
fusion/__xpu__fc_fuse_pass.cc
fusion/__xpu__mmdnn_fuse_pass.cc
fusion/match_matrix_activation_fuse_pass.cc
fusion/scales_fuse_pass.cc
fusion/sequence_reverse_embedding_fuse_pass.cc
elimination/identity_scale_eliminate_pass.cc
elimination/identity_dropout_eliminate_pass.cc
elimination/elementwise_mul_constant_eliminate_pass.cc
......
......@@ -37,6 +37,18 @@ lite_cc_library(fuse_sequence_pool_concat
lite_cc_library(fuse_scale_activation
SRCS scale_activation_fuser.cc
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_reshape
SRCS reshape_fuser.cc
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_match_matrix_activation
SRCS match_matrix_activation_fuser.cc
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_scales
SRCS scales_fuser.cc
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_sequence_reverse_embedding
SRCS sequence_reverse_embedding_fuser.cc
DEPS pattern_matcher_high_api)
set(mir_fusers
fuse_fc
......@@ -52,6 +64,10 @@ set(mir_fusers
fuse_interpolate
fuse_sequence_pool_concat
fuse_scale_activation
fuse_reshape
fuse_match_matrix_activation
fuse_scales
fuse_sequence_reverse_embedding
CACHE INTERNAL "fusers")
if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
......
......@@ -104,9 +104,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
auto conv_weight_t =
scope->FindVar(conv_weight_name)->GetMutable<lite::Tensor>();
auto groups = conv_op_desc->GetAttr<int>("groups");
bool depthwise = false;
if (conv_type_ == "conv2d_transpose") {
depthwise = (conv_weight_t->dims()[0] == conv_weight_t->dims()[1] * groups);
CHECK_EQ(static_cast<size_t>(bn_scale_t->data_size()),
static_cast<size_t>(conv_weight_t->dims()[1] * groups))
<< "The BN bias's size should be equal to the size of the first "
......@@ -120,7 +118,6 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
size_t weight_num = conv_weight_t->data_size();
bool enable_int8 = conv_op_desc->HasAttr("enable_int8");
bool is_weight_quantization = conv_op_desc->HasAttr("quantize_weight_bits");
// compute BN alpha and beta
Tensor alpha_tensor, beta_tensor;
alpha_tensor.CopyDataFrom(*bn_bias_t);
......@@ -162,12 +159,13 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
auto conv_weight_d = conv_weight_t->mutable_data<int8_t>();
// compute new conv_weight for int8
auto weight_scale = conv_op_desc->GetInputScale(weight_name);
if (conv_type_ == "conv2d_transpose" && !depthwise) {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3];
if (conv_type_ == "conv2d_transpose") {
int cout = conv_weight_t->dims()[1] * groups;
int cin_group = conv_weight_t->dims()[0] / groups;
int c_size = cout * conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
for (int k = 0; k < conv_weight_t->dims()[0]; ++k) {
for (int i = 0; i < h; ++i) {
for (int k = 0; k < cin_group; ++k) {
for (int i = 0; i < cout; ++i) {
weight_scale[i] *= fabsf(alpha_data[i]);
if (alpha_data[i] < 0.f) {
auto ptr_row = conv_weight_d + k * c_size + i * hw;
......@@ -203,12 +201,13 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
} else {
// compute new conv_weight
auto conv_weight_d = conv_weight_t->mutable_data<float>();
if (conv_type_ == "conv2d_transpose" && !depthwise) {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3];
if (conv_type_ == "conv2d_transpose") {
int cout = conv_weight_t->dims()[1] * groups;
int cin_group = conv_weight_t->dims()[0] / groups;
int c_size = cout * conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
for (int k = 0; k < conv_weight_t->dims()[0]; ++k) {
for (int i = 0; i < h; ++i) {
for (int k = 0; k < cin_group; ++k) {
for (int i = 0; i < cout; ++i) {
auto ptr_row = conv_weight_d + k * c_size + i * hw;
for (int j = 0; j < hw; ++j) {
ptr_row[j] *= alpha_data[i];
......
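For context, the per-output-channel alpha applied in the loops above comes from the standard conv + batch_norm folding (a sketch of the algebra, independent of the exact weight layout):
  y = gamma * (conv(x, W) + bias - mean) / sqrt(var + eps) + beta
    = conv(x, alpha * W) + (alpha * (bias - mean) + beta),  with alpha = gamma / sqrt(var + eps)
so every output channel i of the weights is scaled by alpha[i] (the sign flip is handled separately on the int8 path, where the quantization scale must stay positive), and the remaining affine term is folded into the new bias.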
......@@ -75,9 +75,8 @@ void ElementwiseActivationFuser::InsertNewNode(SSAGraph* graph,
}
cpp::OpDesc ElementwiseActivationFuser::GenOpDesc(const key2nodes_t& matched) {
auto* desc = matched.at("elt")->stmt()->op_info();
cpp::OpDesc op_desc;
auto op_desc = *matched.at("elt")->stmt()->op_info();
auto* act_op_desc = matched.at("act")->stmt()->op_info();
if (eltwise_type_ == "elementwise_add") {
op_desc.SetType("fusion_elementwise_add_activation");
} else if (eltwise_type_ == "elementwise_sub") {
......@@ -87,13 +86,12 @@ cpp::OpDesc ElementwiseActivationFuser::GenOpDesc(const key2nodes_t& matched) {
} else {
LOG(FATAL) << "not supported elementwise_type: " << eltwise_type_;
}
op_desc.SetInput("X", {matched.at("x")->arg()->name});
op_desc.SetInput("Y", {matched.at("y")->arg()->name});
op_desc.SetOutput("Out", {matched.at("output")->arg()->name});
op_desc.SetAttr("axis", desc->GetAttr<int>("axis"));
op_desc.SetAttr("act_type", act_type_);
auto& out_name = matched.at("output")->arg()->name;
op_desc.SetOutput("Out", {out_name});
if (act_op_desc->HasOutputScale(out_name)) {
op_desc.SetOutputScale(out_name, act_op_desc->GetOutputScale(out_name));
}
return op_desc;
}
......
......@@ -23,7 +23,7 @@ namespace lite {
namespace mir {
void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
#ifdef LITE_WITH_X86
#if defined(LITE_WITH_X86) || defined(LITE_WITH_CUDA)
#ifdef LITE_WITH_MLU
fusion::FcFuser fuser(false);
fuser(graph.get());
......@@ -44,7 +44,7 @@ void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass)
.BindTargets({TARGET(kAny)})
.ExcludeTargets({TARGET(kXPU)})
#ifndef LITE_WITH_MLU
#if (!defined(LITE_WITH_MLU) && !defined(LITE_WITH_HUAWEI_ASCEND_NPU))
.ExcludeTargets({TARGET(kX86)})
#endif
.ExcludeTargets({TARGET(kBM)})
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/match_matrix_activation_fuse_pass.h"
#include <memory>
#include <vector>
#include "lite/core/mir/fusion/match_matrix_activation_fuser.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void MatchMatrixActFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fusion::MatchMatrixActFuser fuser("relu");
fuser(graph.get());
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(lite_match_matrix_activation_fuse_pass,
paddle::lite::mir::MatchMatrixActFusePass)
.BindTargets({TARGET(kCUDA)});
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
class MatchMatrixActFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/match_matrix_activation_fuser.h"
#include <memory>
#include <vector>
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
void MatchMatrixActFuser::BuildPattern() {
// create nodes.
auto* x = VarNode("x")->assert_is_op_input("match_matrix_tensor", "X");
auto* W = VarNode("W")->assert_is_op_input("match_matrix_tensor", "W");
auto* y = VarNode("y")->assert_is_op_input("match_matrix_tensor", "Y");
auto* mm = OpNode("match_matrix_tensor", "match_matrix_tensor");
auto* mm_out =
VarNode("mm_out")->assert_is_op_output("match_matrix_tensor", "Out");
auto* mm_tmp =
VarNode("mm_tmp")->assert_is_op_output("match_matrix_tensor", "Tmp");
auto* act = OpNode("act", activation_);
auto* out = VarNode("Out")->assert_is_op_output(activation_, "Out");
// create topology.
std::vector<PMNode*> mm_inputs{x, W, y};
std::vector<PMNode*> mm_outputs{mm_out, mm_tmp};
mm_inputs >> *mm >> mm_outputs;
// Some op specialities.
mm_out->AsIntermediate();
mm->AsIntermediate();
act->AsIntermediate();
*mm_out >> *act >> *out;
}
void MatchMatrixActFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto op_desc = GenOpDesc(matched);
auto mm_op = LiteOpRegistry::Global().Create("match_matrix_tensor");
auto mm = matched.at("match_matrix_tensor")->stmt()->op();
auto* scope = mm->scope();
auto& valid_places = mm->valid_places();
mm_op->Attach(op_desc, scope);
auto* new_op_node = graph->GraphCreateInstructNode(mm_op, valid_places);
IR_NODE_LINK_TO(matched.at("x"), new_op_node);
IR_NODE_LINK_TO(matched.at("W"), new_op_node);
IR_NODE_LINK_TO(matched.at("y"), new_op_node);
IR_NODE_LINK_TO(new_op_node, matched.at("Out"));
}
cpp::OpDesc MatchMatrixActFuser::GenOpDesc(const key2nodes_t& matched) {
auto op_desc = *matched.at("match_matrix_tensor")->stmt()->op_info();
int dim_t = matched.at("match_matrix_tensor")
->stmt()
->op_info()
->GetAttr<int>("dim_t");
op_desc.mutable_inputs()->clear();
op_desc.mutable_outputs()->clear();
op_desc.SetType("match_matrix_tensor");
op_desc.SetInput("X", {matched.at("x")->arg()->name});
op_desc.SetInput("W", {matched.at("W")->arg()->name});
op_desc.SetInput("Y", {matched.at("y")->arg()->name});
op_desc.SetOutput("Out", {matched.at("Out")->arg()->name});
op_desc.SetOutput("Tmp", {matched.at("mm_tmp")->arg()->name});
op_desc.SetAttr("dim_t", dim_t);
op_desc.SetAttr("fuse_relu", true);
return op_desc;
}
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class MatchMatrixActFuser : public FuseBase {
public:
explicit MatchMatrixActFuser(std::string activation)
: activation_(activation) {}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
std::string activation_;
};
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -175,6 +175,7 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
for (int i = 0; i < weight_scale_size; i++) {
weight_scale.push_back(whole_weight_scale);
}
op_desc.SetAttr("enable_int8", true);
op_desc.SetInputScale(weight_name, weight_scale);
......@@ -280,9 +281,8 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph,
op_desc.SetInput("X", {quantized_op_input->arg()->name});
op_desc.SetOutput("Out", {dequant_op_out->arg()->name});
}
if (quantized_op_type_ != "conv2d_transpose") {
op_desc.SetAttr("enable_int8", true);
}
op_desc.SetAttr("enable_int8", true);
op_desc.SetInputScale(weight_name, weight_scale);
// change the weight from the float type to int8 type.
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/reshape_fuse_pass.h"
#include <memory>
#include <vector>
#include "lite/core/mir/fusion/reshape_fuser.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void ReshapeFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::vector<std::string> reshape_type_cases{"reshape", "reshape2"};
for (auto type_ : reshape_type_cases) {
fusion::ReshapeFuser reshape_fuser(type_);
reshape_fuser(graph.get());
}
for (auto type_ : reshape_type_cases) {
fusion::Reshape2OutFuser reshape2Out_fuser(type_);
reshape2Out_fuser(graph.get());
}
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(lite_reshape_fuse_pass, paddle::lite::mir::ReshapeFusePass)
.BindTargets({TARGET(kAny)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
class ReshapeFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/reshape_fuser.h"
#include <memory>
#include <vector>
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
void ReshapeFuser::BuildPattern() {
auto* x = VarNode("x");
auto* reshape = OpNode("reshape", type_);
auto* reshape_out = VarNode("Out");
auto* out1 = OpNode("out1");
*x >> *reshape >> *reshape_out >> *out1;
}
void ReshapeFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
auto op_desc = const_cast<OpInfo*>(matched.at("reshape")->stmt()->op_info());
op_desc->SetAttr<bool>("inplace", true);
}
void Reshape2OutFuser::BuildPattern() {
auto* x = VarNode("x");
auto* reshape =
OpNode("reshape", type_)->assert_op_attr<bool>("inplace", true);
auto* reshape_out = VarNode("Out");
auto* out1 = OpNode("out1");
auto* out2 = OpNode("out2");
*x >> *reshape >> *reshape_out >> *out1;
*reshape_out >> *out2;
}
void Reshape2OutFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto op_desc = const_cast<OpInfo*>(matched.at("reshape")->stmt()->op_info());
op_desc->SetAttr<bool>("inplace", false);
}
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
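In other words (an illustrative reading of the two fusers above): for a subgraph x -> reshape2 -> y -> fc, ReshapeFuser marks the reshape2 op as inplace so its output can alias the input buffer; if y additionally feeds a second consumer, e.g. x -> reshape2 -> y -> {fc, concat}, Reshape2OutFuser matches the inplace-marked op again and clears the flag, so buffer sharing is, in effect, only kept when the reshaped output has a single reader.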
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class ReshapeFuser : public FuseBase {
public:
explicit ReshapeFuser(const std::string& type) : type_(type) {}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
std::string type_;
};
class Reshape2OutFuser : public FuseBase {
public:
explicit Reshape2OutFuser(const std::string& type) : type_(type) {}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
std::string type_;
};
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -61,20 +61,23 @@ void ScaleActivationFuser::InsertNewNode(SSAGraph* graph,
}
cpp::OpDesc ScaleActivationFuser::GenOpDesc(const key2nodes_t& matched) {
cpp::OpDesc op_desc = *matched.at("scale")->stmt()->op_info();
op_desc.SetOutput("Out", {matched.at("output")->arg()->name});
cpp::OpDesc act_op_desc = *matched.at("act")->stmt()->op_info();
auto op_desc = *matched.at("scale")->stmt()->op_info();
auto* act_op_desc = matched.at("act")->stmt()->op_info();
op_desc.SetAttr("activation_type", act_type_);
if (act_type_ == "relu") {
op_desc.SetAttr("fuse_relu", true);
} else if (act_type_ == "relu6") {
float alpha = act_op_desc.GetAttr<float>("threshold");
float alpha = act_op_desc->GetAttr<float>("threshold");
op_desc.SetAttr("alpha", alpha);
} else if (act_type_ == "leaky_relu") {
float alpha = act_op_desc.GetAttr<float>("alpha");
float alpha = act_op_desc->GetAttr<float>("alpha");
op_desc.SetAttr("alpha", alpha);
}
auto& out_name = matched.at("output")->arg()->name;
op_desc.SetOutput("Out", {out_name});
if (act_op_desc->HasOutputScale(out_name)) {
op_desc.SetOutputScale(out_name, act_op_desc->GetOutputScale(out_name));
}
return op_desc;
}
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/scales_fuse_pass.h"
#include <memory>
#include <vector>
#include "lite/core/mir/fusion/scales_fuser.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void ScalesFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fusion::ScalesFuser fuser;
fuser(graph.get());
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(lite_scales_fuse_pass, paddle::lite::mir::ScalesFusePass)
.BindTargets({TARGET(kCUDA)});
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
class ScalesFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/scales_fuser.h"
#include <memory>
#include <vector>
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
void ScalesFuser::BuildPattern() {
// create input nodes.
auto* x = VarNode("x")->assert_is_op_input("scale", "X")->AsInput();
auto scales_teller = [](const Node* node) -> bool {
bool bias_after_scale =
const_cast<Node*>(node)->AsStmt().op_info()->GetAttr<bool>(
"bias_after_scale");
return bias_after_scale;
};
// create op nodes
auto* scale1 = OpNode("scale1", "scale")
->assert_is_op("scale")
->assert_node_satisfied(scales_teller)
->AsIntermediate();
auto* scale2 = OpNode("scale2", "scale")
->assert_is_op("scale")
->assert_node_satisfied(scales_teller)
->AsIntermediate();
// create intermediate nodes
auto* scale1_out = VarNode("scale1_out")
->assert_is_op_output("scale", "Out")
->assert_is_op_input("scale", "X")
->AsIntermediate();
// create output node
auto* out = VarNode("out")->assert_is_op_output("scale", "Out")->AsOutput();
// create topology.
*x >> *scale1 >> *scale1_out >> *scale2 >> *out;
}
void ScalesFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
auto op_desc = GenOpDesc(matched);
auto scale_op = LiteOpRegistry::Global().Create("scale");
auto scale = matched.at("scale1")->stmt()->op();
auto* scope = scale->scope();
auto& valid_places = scale->valid_places();
scale_op->Attach(op_desc, scope);
auto* new_op_node = graph->GraphCreateInstructNode(scale_op, valid_places);
IR_NODE_LINK_TO(matched.at("x"), new_op_node);
IR_NODE_LINK_TO(new_op_node, matched.at("out"));
}
cpp::OpDesc ScalesFuser::GenOpDesc(const key2nodes_t& matched) {
auto op_desc = *matched.at("scale1")->stmt()->op_info();
float scale1 = op_desc.GetAttr<float>("scale");
float bias1 = op_desc.GetAttr<float>("bias");
float scale2 =
matched.at("scale2")->stmt()->op_info()->GetAttr<float>("scale");
float bias2 = matched.at("scale2")->stmt()->op_info()->GetAttr<float>("bias");
op_desc.SetAttr<float>("scale", scale1 * scale2);
op_desc.SetAttr<float>("bias", bias1 * scale2 + bias2);
auto& out_name = matched.at("out")->arg()->name;
op_desc.SetOutput("Out", {out_name});
return op_desc;
}
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
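As a sanity check on the folded attributes computed in GenOpDesc above (both ops matched with bias_after_scale = true): y1 = scale1 * x + bias1 and y2 = scale2 * y1 + bias2 = (scale1 * scale2) * x + (bias1 * scale2 + bias2), which is exactly the scale and bias written into the fused op. For example, scale1 = 2, bias1 = 1, scale2 = 3, bias2 = 0.5 fold into a single scale op with scale = 6 and bias = 3.5.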
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class ScalesFuser : public FuseBase {
public:
ScalesFuser() {}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
};
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -23,8 +23,11 @@ namespace lite {
namespace mir {
void SequencePoolConcatFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fusion::SequencePoolConcatFuser fuser;
fusion::SequencePool7ConcatFuser fuser;
fuser(graph.get());
fusion::SequencePool2ConcatFuser fuser2;
fuser2(graph.get());
}
} // namespace mir
......
......@@ -21,22 +21,6 @@ namespace lite {
namespace mir {
namespace fusion {
// """
// merge {sequence_pool x 7, concat} => merge_sequence_pool_and_concat
// src1 src2 src7 src1 src2 src7
// | | | | |
// v v | | ... |
// sequence_pool sequence_pool ...(sequence_pool) | | |
// | | | => -------------------
// --------------------------------- |
// | |
// v v
// concat sequence_pool_concat
// """
void SequencePoolConcatFuser::BuildPattern() {
// create nodes.
auto* concat = OpNode("concat", "concat")->AsIntermediate();
#define STR1(R) #R
#define STR2(R) STR1(R)
......@@ -58,6 +42,22 @@ void SequencePoolConcatFuser::BuildPattern() {
*sequence_pool_##num >> *sequence_pool_##num##_idx; \
*x_##num >> *sequence_pool_##num >> *sequence_pool_##num##_out >> *concat;
// """
// merge {sequence_pool x 7, concat} => merge_sequence_pool_and_concat
// src1 src2 src7 src1 src2 src7
// | | | | |
// v v | | ... |
// sequence_pool sequence_pool ...(sequence_pool) | | |
// | | | => -------------------
// --------------------------------- |
// | |
// v v
// concat sequence_pool_concat
// """
void SequencePool7ConcatFuser::BuildPattern() {
// create nodes.
auto* concat = OpNode("concat", "concat")->AsIntermediate();
auto* concat_out =
VarNode("concat_out")->assert_is_op_output("concat", "Out");
*concat >> *concat_out;
......@@ -69,14 +69,10 @@ void SequencePoolConcatFuser::BuildPattern() {
POOL_CONCAT_PATTERN(5);
POOL_CONCAT_PATTERN(6);
POOL_CONCAT_PATTERN(7);
#undef POOL_CONCAT_PATTERN
#undef STR1
#undef STR2
}
void SequencePoolConcatFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
void SequencePool7ConcatFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto op_desc = GenOpDesc(matched);
auto sequence_pool_concat_op =
LiteOpRegistry::Global().Create("sequence_pool_concat");
......@@ -99,7 +95,7 @@ void SequencePoolConcatFuser::InsertNewNode(SSAGraph* graph,
IR_NODE_LINK_TO(new_op_node, matched.at("concat_out"));
}
cpp::OpDesc SequencePoolConcatFuser::GenOpDesc(const key2nodes_t& matched) {
cpp::OpDesc SequencePool7ConcatFuser::GenOpDesc(const key2nodes_t& matched) {
cpp::OpDesc op_desc = *matched.at("concat")->stmt()->op_info();
op_desc.SetType("sequence_pool_concat");
op_desc.SetInput("X",
......@@ -147,6 +143,64 @@ cpp::OpDesc SequencePoolConcatFuser::GenOpDesc(const key2nodes_t& matched) {
return op_desc;
}
void SequencePool2ConcatFuser::BuildPattern() {
// create nodes.
auto* concat = OpNode("concat", "concat")->AsIntermediate();
auto* concat_out =
VarNode("concat_out")->assert_is_op_output("concat", "Out");
*concat >> *concat_out;
POOL_CONCAT_PATTERN(1);
POOL_CONCAT_PATTERN(2);
}
void SequencePool2ConcatFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto op_desc = GenOpDesc(matched);
auto sequence_pool_concat_op =
LiteOpRegistry::Global().Create("sequence_pool_concat");
auto concat = matched.at("concat")->stmt()->op();
auto* scope = concat->scope();
auto& valid_places = concat->valid_places();
sequence_pool_concat_op->Attach(op_desc, scope);
auto* new_op_node =
graph->GraphCreateInstructNode(sequence_pool_concat_op, valid_places);
IR_NODE_LINK_TO(matched.at("sequence_pool_x_1"), new_op_node);
IR_NODE_LINK_TO(matched.at("sequence_pool_x_2"), new_op_node);
IR_NODE_LINK_TO(new_op_node, matched.at("concat_out"));
}
cpp::OpDesc SequencePool2ConcatFuser::GenOpDesc(const key2nodes_t& matched) {
cpp::OpDesc op_desc = *matched.at("concat")->stmt()->op_info();
op_desc.SetType("sequence_pool_concat");
op_desc.SetInput("X",
{matched.at("sequence_pool_x_1")->arg()->name,
matched.at("sequence_pool_x_2")->arg()->name});
std::vector<std::string> pooltypes;
pooltypes.push_back(matched.at("sequence_pool_1")
->stmt()
->op_info()
->GetAttr<std::string>("pooltype"));
pooltypes.push_back(matched.at("sequence_pool_2")
->stmt()
->op_info()
->GetAttr<std::string>("pooltype"));
op_desc.SetAttr("pooltype", pooltypes);
op_desc.SetOutput("Out", {matched.at("concat_out")->arg()->name});
return op_desc;
}
#undef POOL_CONCAT_PATTERN
#undef STR1
#undef STR2
} // namespace fusion
} // namespace mir
} // namespace lite
......
......@@ -23,7 +23,16 @@ namespace lite {
namespace mir {
namespace fusion {
class SequencePoolConcatFuser : public FuseBase {
class SequencePool7ConcatFuser : public FuseBase {
public:
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
};
class SequencePool2ConcatFuser : public FuseBase {
public:
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/sequence_reverse_embedding_fuse_pass.h"
#include <memory>
#include <vector>
#include "lite/core/mir/fusion/sequence_reverse_embedding_fuser.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void SequenceReverseEmbeddingFusePass::Apply(
const std::unique_ptr<SSAGraph>& graph) {
fusion::SequenceReverseEmbeddingFuser fuser;
fuser(graph.get());
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(lite_sequence_reverse_embedding_fuse_pass,
paddle::lite::mir::SequenceReverseEmbeddingFusePass)
.BindTargets({TARGET(kCUDA)});
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
class SequenceReverseEmbeddingFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/sequence_reverse_embedding_fuser.h"
#include <memory>
#include <vector>
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
void SequenceReverseEmbeddingFuser::BuildPattern() {
// create input nodes.
auto* x =
VarNode("x")->assert_is_op_input("sequence_reverse", "X")->AsInput();
auto* w = VarNode("w")->assert_is_op_input("lookup_table", "W")->AsInput();
// create op nodes
auto* sequence_reverse = OpNode("sequence_reverse", "sequence_reverse")
->assert_is_op("sequence_reverse")
->AsIntermediate();
auto* lookup_table = OpNode("lookup_table", "lookup_table")
->assert_is_op("lookup_table")
->AsIntermediate();
// create intermediate nodes
auto* sequence_reverse_out =
VarNode("sequence_reverse_out")
->assert_is_op_output("sequence_reverse", "Y")
->assert_is_op_input("lookup_table", "Ids")
->AsIntermediate();
// create output node
auto* out =
VarNode("out")->assert_is_op_output("lookup_table", "Out")->AsOutput();
// create topology.
*x >> *sequence_reverse >> *sequence_reverse_out >> *lookup_table >> *out;
*w >> *lookup_table;
}
void SequenceReverseEmbeddingFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto op_desc = GenOpDesc(matched);
auto fuse_op = LiteOpRegistry::Global().Create("sequence_reverse_embedding");
auto lookup_table = matched.at("lookup_table")->stmt()->op();
auto* scope = lookup_table->scope();
auto& valid_places = lookup_table->valid_places();
fuse_op->Attach(op_desc, scope);
auto* new_op_node = graph->GraphCreateInstructNode(fuse_op, valid_places);
IR_NODE_LINK_TO(matched.at("x"), new_op_node);
IR_NODE_LINK_TO(matched.at("w"), new_op_node);
IR_NODE_LINK_TO(new_op_node, matched.at("out"));
}
cpp::OpDesc SequenceReverseEmbeddingFuser::GenOpDesc(
const key2nodes_t& matched) {
auto op_desc = *matched.at("lookup_table")->stmt()->op_info();
op_desc.SetType("sequence_reverse_embedding");
auto& in_name = matched.at("x")->arg()->name;
auto& w_name = matched.at("w")->arg()->name;
auto& out_name = matched.at("out")->arg()->name;
op_desc.SetInput("Ids", {in_name});
op_desc.SetInput("W", {w_name});
op_desc.SetOutput("Out", {out_name});
return op_desc;
}
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class SequenceReverseEmbeddingFuser : public FuseBase {
public:
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
};
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -25,6 +25,9 @@ void VarConvActivationFuser::BuildPattern() {
// create nodes.
auto* input = VarNode("X")->assert_is_op_input(conv_type_, "X")->AsInput();
auto* filter = VarNode("W")->assert_is_op_input(conv_type_, "W")->AsInput();
auto* column =
VarNode("COLUMN")->assert_is_op_input(conv_type_, "COLUMN")->AsInput();
auto* row = VarNode("ROW")->assert_is_op_input(conv_type_, "ROW")->AsInput();
auto* conv2d = OpNode("var_conv_2d", conv_type_)->AsIntermediate();
......@@ -42,7 +45,7 @@ void VarConvActivationFuser::BuildPattern() {
VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput();
// create topology.
std::vector<PMNode*> conv2d_inputs{filter, input};
std::vector<PMNode*> conv2d_inputs{filter, input, column, row};
conv2d_inputs >> *conv2d >> *conv2d_out >> *act >> *out;
*conv2d >> *conv2d_out_1;
}
......@@ -60,6 +63,8 @@ void VarConvActivationFuser::InsertNewNode(SSAGraph* graph,
IR_NODE_LINK_TO(matched.at("X"), new_op_node);
IR_NODE_LINK_TO(matched.at("W"), new_op_node);
IR_NODE_LINK_TO(matched.at("COLUMN"), new_op_node);
IR_NODE_LINK_TO(matched.at("ROW"), new_op_node);
IR_NODE_LINK_TO(new_op_node, matched.at("output"));
}
......
......@@ -32,6 +32,7 @@ void QuantizedOpAttributesInferencePass::Apply(
// Only for fully quantized models, which are only supported by MTK and RK NPUs.
// Replace the output_scale with the input_scale of the adjacent quantized
// ops, and fix the missing of the attribute 'enable_int8'.
VLOG(5) << "\n" << Visualize(graph.get());
for (auto& op_node : graph->StmtTopologicalOrder()) {
if (!op_node->IsStmt()) continue;
auto& inst = op_node->AsStmt();
......
......@@ -91,11 +91,14 @@ class Optimizer {
// kernels for devices automatically.
"lite_conv_activation_fuse_pass", //
"lite_var_conv_2d_activation_fuse_pass", //
"lite_match_matrix_activation_fuse_pass", //
"lite_fc_fuse_pass", //
"lite_shuffle_channel_fuse_pass", //
"lite_transpose_softmax_transpose_fuse_pass", //
"lite_interpolate_fuse_pass", //
"identity_scale_eliminate_pass", //
"lite_scales_fuse_pass", //
"lite_sequence_reverse_embedding_fuse_pass", //
"elementwise_mul_constant_eliminate_pass", //
"lite_sequence_pool_concat_fuse_pass", //
"lite_scale_activation_fuse_pass", //
......@@ -161,6 +164,7 @@ class Optimizer {
"runtime_context_assign_pass",
"argument_type_display_pass",
"lite_reshape_fuse_pass",
"memory_optimize_pass"}};
......
......@@ -159,7 +159,9 @@ RuntimeProgram::RuntimeProgram(
int block_idx)
: exec_scope_(exec_scope) {
#ifdef LITE_WITH_OPENCL
bool opencl_valid = CLRuntime::Global()->OpenCLAvaliableForDevice();
bool opencl_valid = paddle::lite::CLWrapper::Global()->OpenclLibFound() &&
paddle::lite::CLWrapper::Global()->DlsymSuccess() &&
CLRuntime::Global()->OpenCLAvaliableForDevice();
using OpenCLContext = Context<TargetType::kOpenCL>;
std::unique_ptr<KernelContext> unique_opencl_ctx(new KernelContext());
if (opencl_valid) {
......
......@@ -29,6 +29,10 @@ namespace kernels {
namespace apu {
bool SubgraphEngine::BuildDeviceProgram() {
if (!origin_program_) {
BuildOriginProgram();
}
unsigned int version;
Neuron_getVersion(&version);
VLOG(3) << "Neuron Adapter version: " << version;
......@@ -46,9 +50,6 @@ bool SubgraphEngine::BuildDeviceProgram() {
// Convert all of ops and their input vars and weights and added into the APU
// NIR graph
if (!origin_program_) {
BuildOriginProgram();
}
const auto& bridges = subgraph::Registry::Instance();
const auto& insts = origin_program_->instructions(kRootBlockIdx);
for (auto& inst : insts) {
......
......@@ -78,6 +78,7 @@ add_kernel(assign_value_compute_arm ARM basic SRCS assign_value_compute.cc DEPS
add_kernel(collect_fpn_proposals_compute_arm ARM extra SRCS collect_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(distribute_fpn_proposals_compute_arm ARM extra SRCS distribute_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(clip_compute_arm ARM extra SRCS clip_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(pixel_shuffle_compute_arm ARM extra SRCS pixel_shuffle_compute.cc DEPS ${lite_kernel_deps} math_arm)
# for OCR specific
add_kernel(gru_unit_compute_arm ARM extra SRCS gru_unit_compute.cc DEPS ${lite_kernel_deps} math_arm)
......@@ -104,6 +105,7 @@ add_kernel(mean_grad_compute_arm ARM train SRCS mean_grad_compute.cc DEPS ${lite
add_kernel(elementwise_grad_compute_arm ARM train SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(mul_grad_compute_arm ARM train SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sgd_compute_arm ARM train SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sequence_pool_grad_compute_arm ARM train SRCS sequence_pool_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm)
lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm)
......
......@@ -40,6 +40,11 @@ void CastCompute::Run() {
const auto* x_data = param.X->data<float>();
auto* o_data = param.Out->mutable_data<float>();
memcpy(o_data, x_data, sizeof(float) * param.X->numel());
} else if (param.in_dtype == param.out_dtype &&
param.in_dtype == 3) { // int64->int64
const auto* x_data = param.X->data<int64_t>();
auto* o_data = param.Out->mutable_data<int64_t>();
memcpy(o_data, x_data, sizeof(int64_t) * param.X->numel());
} else if (param.in_dtype == 21 && param.out_dtype == 5) { // int8->float32
const char* x_data_begin = param.X->data<char>();
const char* x_data_end = x_data_begin + param.X->numel();
......@@ -56,7 +61,7 @@ void CastCompute::Run() {
float* out_data = param.Out->mutable_data<float>();
std::transform(
x_data_begin, x_data_end, out_data, TransOp<unsigned char, float>);
} else if (param.in_dtype == 3 && param.out_dtype == 2) {
} else if (param.in_dtype == 3 && param.out_dtype == 2) { // int64->int32
const int64_t* x_data_begin = param.X->data<int64_t>();
const int64_t* x_data_end = x_data_begin + param.X->numel();
int32_t* out_data = param.Out->mutable_data<int32_t>();
......@@ -72,6 +77,12 @@ void CastCompute::Run() {
const int64_t* x_data_end = x_data_begin + param.X->numel();
float* out_data = param.Out->mutable_data<float>();
std::transform(x_data_begin, x_data_end, out_data, TransOp<int64_t, float>);
} else if (param.in_dtype == 2 && param.out_dtype == 3) { // INT32 -> INT64
const int32_t* x_data_begin = param.X->data<int32_t>();
const int32_t* x_data_end = x_data_begin + param.X->numel();
int64_t* out_data = param.Out->mutable_data<int64_t>();
std::transform(
x_data_begin, x_data_end, out_data, TransOp<int32_t, int64_t>);
} else {
LOG(FATAL) << "other has not been implemented transform with dtype"
<< param.in_dtype << " X, dtype" << param.out_dtype << " Out";
......
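For readers decoding the integer comparisons above: the in_dtype / out_dtype values follow the framework proto's VarType encoding, which (as defined there) is 0 = BOOL, 1 = INT16, 2 = INT32, 3 = INT64, 4 = FP16, 5 = FP32, 6 = FP64, 20 = UINT8 and 21 = INT8; the branches visible in this hunk therefore cover float copy, int64 copy, int8->float32, uint8->float32, int64->int32, int64->float32 and int32->int64.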
......@@ -31,7 +31,6 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
auto paddings = *param.paddings;
// select dw conv kernel
if (kw == 3) {
// VLOG(5) << "invoke 3x3 dw conv fp32";
bool pads_less = ((paddings[1] < 2) && (paddings[3] < 2));
if (pads_less && paddings[0] == paddings[2] &&
(paddings[0] == 0 || paddings[0] == 1)) {
......@@ -54,7 +53,6 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
kernel_func_name_ = "conv_depthwise_3x3_fp32";
#endif
} else if (kw == 5) {
// VLOG(5) << "invoke 5x5 dw conv fp32";
auto strides = param.strides;
if ((strides[0] == 1 && strides[1] == 1) ||
(strides[0] == 2 && strides[1] == 2)) {
......@@ -104,23 +102,44 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() {
w_scale_[i] = scale[i] * in_scale;
}
}
auto paddings = *param.paddings;
auto strides = param.strides;
auto x_dims = param.x->dims();
int iw = x_dims[3];
int ih = x_dims[2];
auto act_param = param.activation_param;
bool has_act = act_param.has_active;
lite_api::ActivationType act_type = act_param.active_type;
// no activation and relu activation is supported now
bool support_act_type =
(has_act == false) ||
(has_act == true && act_type == lite_api::ActivationType::kRelu);
bool support_pad_type =
(paddings[0] == paddings[1]) && (paddings[2] == paddings[3]) &&
(paddings[0] == paddings[2]) && (paddings[0] == 0 || paddings[0] == 1);
bool support_stride_type = (strides[0] == 1 && strides[1] == 1);
bool support_width_type = iw > 9;
/// select dw conv kernel
if (kw == 3) {
// trans weights
// VLOG(5) << "invoke 3x3 dw conv int8 kernel fp32 out";
impl_ = lite::arm::math::conv_depthwise_3x3_int8_fp32;
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_depthwise_3x3_int8_fp32";
#endif
int cround = ROUNDUP(w_dims[0], 8);
weights_.Resize({cround / 8, 1, kh * kw, 8});
auto wptr = param.filter->data<int8_t>();
auto wptr_new = weights_.mutable_data<int8_t>();
lite::arm::math::conv_trans_weights_numc(wptr, wptr_new, oc, 1, 8, 9);
flag_trans_weights_ = true;
if (!support_act_type || !support_pad_type || !support_stride_type ||
!support_width_type) {
int cround = ROUNDUP(w_dims[0], 8);
weights_.Resize({cround / 8, 1, kh * kw, 8});
auto wptr = param.filter->data<int8_t>();
auto wptr_new = weights_.mutable_data<int8_t>();
lite::arm::math::conv_trans_weights_numc(wptr, wptr_new, oc, 1, 8, 9);
flag_trans_weights_ = true;
} else {
flag_trans_weights_ = false;
}
} else if (kw == 5) {
// trans weights
// VLOG(5) << "invoke 5x5 dw conv int8 kernel fp32 out";
impl_ = lite::arm::math::conv_depthwise_5x5_int8_fp32;
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_depthwise_5x5_int8_fp32";
......@@ -175,23 +194,45 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
param.activation_param.Relu_clipped_coef =
param.activation_param.Relu_clipped_coef / param.output_scale;
}
auto paddings = *param.paddings;
auto strides = param.strides;
auto x_dims = param.x->dims();
int iw = x_dims[3];
int ih = x_dims[2];
auto act_param = param.activation_param;
bool has_act = act_param.has_active;
lite_api::ActivationType act_type = act_param.active_type;
// no activation and relu activation is supported now
bool support_act_type =
(has_act == false) ||
(has_act == true && act_type == lite_api::ActivationType::kRelu);
bool support_pad_type =
(paddings[0] == paddings[1]) && (paddings[2] == paddings[3]) &&
(paddings[0] == paddings[2]) && (paddings[0] == 0 || paddings[0] == 1);
bool support_stride_type = (strides[0] == 1 && strides[1] == 1);
bool support_width_type = iw > 9;
/// select dw conv kernel
if (kw == 3) {
// trans weights
// VLOG(5) << "invoke 3x3 dw conv int8 kernel int8 out";
impl_ = lite::arm::math::conv_depthwise_3x3_int8_int8;
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_depthwise_3x3_int8_int8";
#endif
int cround = ROUNDUP(w_dims[0], 8);
weights_.Resize({cround / 8, 1, kh * kw, 8});
auto wptr = param.filter->data<int8_t>();
auto wptr_new = weights_.mutable_data<int8_t>();
lite::arm::math::conv_trans_weights_numc(wptr, wptr_new, oc, 1, 8, 9);
flag_trans_weights_ = true;
if (!support_act_type || !support_pad_type || !support_stride_type ||
!support_width_type) {
int cround = ROUNDUP(w_dims[0], 8);
weights_.Resize({cround / 8, 1, kh * kw, 8});
auto wptr = param.filter->data<int8_t>();
auto wptr_new = weights_.mutable_data<int8_t>();
lite::arm::math::conv_trans_weights_numc(wptr, wptr_new, oc, 1, 8, 9);
flag_trans_weights_ = true;
} else {
flag_trans_weights_ = false;
}
} else if (kw == 5) {
// trans weights
// VLOG(5) << "invoke 5x5 dw conv int8 kernel int8 out";
impl_ = lite::arm::math::conv_depthwise_5x5_int8_int8;
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "conv_depthwise_5x5_int8_int8";
......@@ -283,7 +324,7 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
auto w_dims = param.filter->dims();
auto o_dims = param.output->dims();
int iw = x_dims[3]; // nchw
int iw = x_dims[3];
int ih = x_dims[2];
int ic = x_dims[1];
int bs = x_dims[0];
......@@ -333,7 +374,7 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
auto w_dims = param.filter->dims();
auto o_dims = param.output->dims();
int iw = x_dims[3]; // nchw
int iw = x_dims[3];
int ih = x_dims[2];
int ic = x_dims[1];
int bs = x_dims[0];
......
......@@ -73,7 +73,6 @@ void Conv2DTransposeCompute::Run() {
int kw = w_dims[3]; // oihw
int kh = w_dims[2];
int group = param.groups;
bool fuse_relu = param.fuse_relu;
bool flag_bias = (param.bias != nullptr);
auto paddings = *param.paddings;
......@@ -104,6 +103,7 @@ void Conv2DTransposeCompute::Run() {
auto dout = param.output->mutable_data<float>();
auto weights = param.filter->data<float>();
auto act_param = param.activation_param;
bool has_act = act_param.has_active;
for (int i = 0; i < num; i++) {
const float* din_batch = din + i * chin * hin * win;
float* dout_batch = dout + i * chout * hout * wout;
......@@ -152,13 +152,14 @@ void Conv2DTransposeCompute::Run() {
dout_batch);
}
if (flag_bias) {
lite::arm::math::fill_bias_relu<float>(
act_param.has_active = has_act;
lite::arm::math::fill_bias_act<float>(
dout_batch,
static_cast<const float*>(param.bias->data<float>()),
chout,
wout * hout,
flag_bias,
fuse_relu);
&act_param);
}
}
}
......
......@@ -78,6 +78,9 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::ReInitWhenNeeded() {
weights_.Resize({1, 1, 1, wino_iw * wino_iw * oc_pad * ic_pad});
void* trans_tmp_ptr = malloc(sizeof(float) * wino_iw * wino_iw * oc * ic);
auto weights_data_ = weights_.mutable_data<float>();
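// zero the whole padded weight buffer first: oc_pad / ic_pad round the channel
// counts up to the block size, and the weight transform below only fills the
// real oc x ic channels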
memset(reinterpret_cast<char*>(weights_data_),
0,
weights_.numel() * sizeof(float));
if (!choose_small_) {
lite::arm::math::weight_trans_c4_8x8(
weights_data_, param.filter->data<float>(), ic, oc, trans_tmp_ptr);
......@@ -251,6 +254,9 @@ void WinogradConv<PRECISION(kInt8), OutType>::ReInitWhenNeeded() {
weights_.Resize({1, 1, 1, wino_iw * wino_iw * oc_pad * ic_pad});
void* trans_tmp_ptr = malloc(sizeof(int16_t) * wino_iw * wino_iw * oc * ic);
auto weights_data_ = weights_.mutable_data<int16_t>();
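// as in the fp32 path, zero the padded int16 weight buffer before the
// (partial) weight transform writes the real channels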
memset(reinterpret_cast<char*>(weights_data_),
0,
weights_.numel() * sizeof(int16_t));
if (!choose_small_) {
} else {
lite::arm::math::weight_trans_c8_4x4_int8(
......
......@@ -137,10 +137,11 @@ void ElementwiseSubCompute::Run() {
auto x_dims = param.X->dims();
auto y_dims = param.Y->dims();
int pre, n, post;
if (x_dims.size() < y_dims.size()) {
LOG(FATAL) << "elewise div don't support x_dims size < y_dims size";
}
if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
if (x_dims.size() < y_dims.size() &&
is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) {
lite::arm::math::elementwise_sub_broadcast(
y_data, x_data, out_data, pre, n, post);
} else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
lite::arm::math::elementwise_sub_broadcast(
x_data, y_data, out_data, pre, n, post);
} else {
......@@ -158,24 +159,21 @@ void ElementwiseSubActivationCompute::Run() {
std::string act_type = param.act_type;
auto x_dims = param.X->dims();
auto y_dims = param.Y->dims();
if (x_dims.size() < y_dims.size()) {
LOG(FATAL) << "elewise div don't support x_dims size < y_dims size";
}
int pre, n, post;
if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
if (act_type == "relu") {
lite::arm::math::elementwise_sub_relu_broadcast(
x_data, y_data, out_data, pre, n, post);
} else {
LOG(FATAL) << "unsupported Activation type: " << act_type;
}
if (act_type != "relu") {
LOG(FATAL) << "unsupported Activation type: " << act_type;
}
if (x_dims.size() < y_dims.size() &&
is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) {
lite::arm::math::elementwise_sub_relu_broadcast(
y_data, x_data, out_data, pre, n, post);
} else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
lite::arm::math::elementwise_sub_relu_broadcast(
x_data, y_data, out_data, pre, n, post);
} else {
if (act_type == "relu") {
lite::arm::math::elementwise_sub_relu(
x_data, y_data, out_data, x_dims.production());
} else {
LOG(FATAL) << "unsupported Activation type: " << act_type;
}
lite::arm::math::elementwise_sub_relu(
x_data, y_data, out_data, x_dims.production());
}
}
......
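For context, the is_broadcast helper used above decomposes a broadcast into three extents pre, n and post; a rough worked example of the assumed semantics (the helper itself is defined elsewhere in this file):

// Assumed semantics of is_broadcast(x_dims, y_dims, axis, &pre, &n, &post):
//   x shape (2, 3, 4, 5), y shape (3, 4), axis = 1
//     pre  = 2        product of x dims before axis
//     n    = 3 * 4    product of y dims (the broadcast span)
//     post = 5        product of x dims after the span
// In the flattened (pre, n, post) view, element (i, j, k) of x pairs with y[j];
// the new branches simply swap the operand roles when y has the higher rank.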
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/pixel_shuffle_compute.h"
#include <string>
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
void PixelShuffleCompute::Run() {
auto& param = Param<operators::PixelShuffleParam>();
const float* x_data = param.x->data<float>();
float* output_data = param.output->mutable_data<float>();
int upscale_factor = param.upscale_factor;
int batch_size = param.x->dims()[0];
int height = param.x->dims()[2];
int width = param.x->dims()[3];
int out_channels = param.output->dims()[1];
int out_height = param.output->dims()[2];
int out_width = param.output->dims()[3];
#pragma omp parallel for
for (int nc = 0; nc < batch_size * out_channels; nc++) {
const float* inptr = x_data + nc * out_height * out_width;
float* outptr_nc = output_data + nc * out_height * out_width;
for (int sh = 0; sh < upscale_factor; sh++) {
for (int sw = 0; sw < upscale_factor; sw++) {
float* outptr = outptr_nc + sh * out_width + sw;
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
outptr[0] = inptr[0];
inptr++;
outptr += upscale_factor;
}
outptr += (upscale_factor - 1) * out_width;
}
}
}
}
#ifdef LITE_WITH_PROFILE
kernel_func_name_ = "pixel_shuffle_func";
#endif
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(pixel_shuffle,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::PixelShuffleCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
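As a cross-check of the loop nest above, a naive scalar reference of the same mapping (a sketch for clarity, not the optimized kernel):

// pixel_shuffle rearranges (N, C*r*r, H, W) into (N, C, H*r, W*r) with
// out[n][c][h*r + sh][w*r + sw] = in[n][c*r*r + sh*r + sw][h][w].
static void pixel_shuffle_ref(
    const float* in, float* out, int N, int C, int H, int W, int r) {
  for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
      for (int sh = 0; sh < r; ++sh)
        for (int sw = 0; sw < r; ++sw)
          for (int h = 0; h < H; ++h)
            for (int w = 0; w < W; ++w)
              out[((n * C + c) * H * r + h * r + sh) * W * r + w * r + sw] =
                  in[((n * C * r * r + c * r * r + sh * r + sw) * H + h) * W + w];
}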
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <string>
#include "lite/core/kernel.h"
#include "lite/operators/pixel_shuffle_op.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class PixelShuffleCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::PixelShuffleParam;
void Run() override;
virtual ~PixelShuffleCompute() = default;
#ifdef LITE_WITH_PROFILE
virtual void SetProfileRuntimeKernelInfo(
paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
}
std::string kernel_func_name_{"NotImplForPixelShuffle"};
#endif
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -25,6 +25,7 @@ void ReduceMaxCompute::Run() {
auto& param = Param<operators::ReduceMaxParam>();
const float* input = param.X->data<float>();
auto x_dims = param.X->dims();
int x_rank = x_dims.size();
float* output = param.Out->mutable_data<float>();
bool keep_dim = param.keep_dim;
......@@ -37,41 +38,74 @@ void ReduceMaxCompute::Run() {
}
}
}
int n_in = x_dims[0];
int c_in = x_dims[1];
int h_in = x_dims[2];
int w_in = x_dims[3];
if (dim.size() == 0) {
lite::arm::math::reduce_all(input, output, n_in, c_in, h_in, w_in);
} else if (dim.size() == 1) {
switch (dim[0]) {
case 0:
lite::arm::math::reduce_n(input, output, n_in, c_in, h_in, w_in);
break;
case 1:
lite::arm::math::reduce_c(input, output, n_in, c_in, h_in, w_in);
break;
case 2:
lite::arm::math::reduce_h(input, output, n_in, c_in, h_in, w_in);
break;
case 3:
lite::arm::math::reduce_w(input, output, n_in, c_in, h_in, w_in);
break;
default:
LOG(FATAL) << "error!!!";
if (x_dims.size() == 3) {
if (dim.size() == 0 || dim.size() == 3) {
lite::arm::math::reduce_all_of_three(
input, output, x_dims[0], x_dims[1], x_dims[2]);
} else if (dim.size() == 1) {
switch (dim[0]) {
case 0:
lite::arm::math::reduce_first_of_three(
input, output, x_dims[0], x_dims[1], x_dims[2]);
break;
case 1:
lite::arm::math::reduce_second_of_three(
input, output, x_dims[0], x_dims[1], x_dims[2]);
break;
case 2:
lite::arm::math::reduce_third_of_three(
input, output, x_dims[0], x_dims[1], x_dims[2]);
break;
default:
LOG(FATAL) << "error!!!";
}
} else if (dim.size() == 2) {
LOG(FATAL) << "Will support later!!";
} else {
LOG(FATAL) << "dim size should not larger than 3!!!";
}
} else if (dim.size() == 2) {
if (dim[0] == 0 && dim[1] == 1) {
lite::arm::math::reduce_nc(input, output, n_in, c_in, h_in, w_in);
} else if (dim[0] == 1 && dim[1] == 2) {
lite::arm::math::reduce_ch(input, output, n_in, c_in, h_in, w_in);
} else if (dim[0] == 2 && dim[1] == 3) {
lite::arm::math::reduce_hw(input, output, n_in, c_in, h_in, w_in);
} else if (x_dims.size() == 4) {
int n_in = x_dims[0];
int c_in = x_dims[1];
int h_in = x_dims[2];
int w_in = x_dims[3];
if (dim.size() == 0) {
lite::arm::math::reduce_all(input, output, n_in, c_in, h_in, w_in);
} else if (dim.size() == 1) {
switch (dim[0]) {
case 0:
lite::arm::math::reduce_n(input, output, n_in, c_in, h_in, w_in);
break;
case 1:
lite::arm::math::reduce_c(input, output, n_in, c_in, h_in, w_in);
break;
case 2:
lite::arm::math::reduce_h(input, output, n_in, c_in, h_in, w_in);
break;
case 3:
lite::arm::math::reduce_w(input, output, n_in, c_in, h_in, w_in);
break;
default:
LOG(FATAL) << "error!!!";
}
} else if (dim.size() == 2) {
if (dim[0] == 0 && dim[1] == 1) {
lite::arm::math::reduce_nc(input, output, n_in, c_in, h_in, w_in);
} else if (dim[0] == 1 && dim[1] == 2) {
lite::arm::math::reduce_ch(input, output, n_in, c_in, h_in, w_in);
} else if (dim[0] == 2 && dim[1] == 3) {
lite::arm::math::reduce_hw(input, output, n_in, c_in, h_in, w_in);
} else {
LOG(FATAL) << "invalid dim!!";
}
} else {
LOG(FATAL) << "invalid dim!!";
LOG(FATAL) << "dim's size over than 2, which is not supported now!!";
}
} else {
LOG(FATAL) << "dim's size over than 2, which is not supported now!!";
LOG(FATAL) << "only support input with 3&4 dimensions now!!";
}
}
......
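A compact summary of the dispatch implemented above:

// rank-3 input: dim {}                -> reduce_all_of_three
//               dim {0} / {1} / {2}   -> reduce_first/second/third_of_three
//               two or more dims      -> not supported yet
// rank-4 input: dim {}                -> reduce_all
//               dim {0}..{3}          -> reduce_n / reduce_c / reduce_h / reduce_w
//               dim {0,1}/{1,2}/{2,3} -> reduce_nc / reduce_ch / reduce_hw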
......@@ -32,6 +32,7 @@ void SequencePoolCompute::Run() {
auto& output = param.Out;
const auto* din = param.X->data<float>();
float* dout = output->mutable_data<float>();
int64_t* max_index = param.MaxIndex->mutable_data<int64_t>();
const auto pool_type = param.pool_type;
const auto lod = param.X->lod()[0];
......@@ -44,9 +45,9 @@ void SequencePoolCompute::Run() {
} else if (pool_type == "SQRT") {
lite::arm::math::seq_pool_sqrt(din, dout, lod, width);
} else if (pool_type == "MAX") {
lite::arm::math::seq_pool_max(din, dout, lod, width);
lite::arm::math::seq_pool_max(din, dout, max_index, lod, width);
} else if (pool_type == "MIN") {
lite::arm::math::seq_pool_min(din, dout, lod, width);
lite::arm::math::seq_pool_min(din, dout, max_index, lod, width);
} else if (pool_type == "FIRST") {
lite::arm::math::seq_pool_first(din, dout, lod, width);
} else if (pool_type == "LAST") {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/kernels/arm/sequence_pool_grad_compute.h"
#include <string>
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
void SequencePoolGradCompute::PrepareForRun() {}
void SequencePoolGradCompute::Run() {
auto& param = Param<operators::SequencePoolGradParam>();
auto& output_grad = param.Out_Grad;
auto& x_grad = param.X_Grad;
const auto* din_ptr = param.X->data<float>();
const auto* dout_grad_ptr = output_grad->data<float>();
const auto* index_grad_ptr = param.MaxIndex_Grad->data<int64_t>();
float* x_grad_ptr = x_grad->mutable_data<float>();
const auto pool_type = param.pool_type;
const auto lod = param.X->lod()[0];
int64_t width = param.X->numel() / param.X->dims()[0];
if (pool_type == "SUM") {
lite::arm::math::seq_pool_sum_grad(
din_ptr, dout_grad_ptr, x_grad_ptr, lod, width);
} else if (pool_type == "AVERAGE") {
lite::arm::math::seq_pool_average_grad(
din_ptr, dout_grad_ptr, x_grad_ptr, lod, width);
} else if (pool_type == "SQRT") {
lite::arm::math::seq_pool_sqrt_grad(
din_ptr, dout_grad_ptr, x_grad_ptr, lod, width);
} else if (pool_type == "MAX" || pool_type == "MIN") {
lite::arm::math::seq_pool_max_grad(
din_ptr, dout_grad_ptr, index_grad_ptr, x_grad_ptr, lod, width);
} else if (pool_type == "FIRST") {
lite::arm::math::seq_pool_first_grad(
din_ptr, dout_grad_ptr, x_grad_ptr, lod, width);
} else if (pool_type == "LAST") {
lite::arm::math::seq_pool_last_grad(
din_ptr, dout_grad_ptr, x_grad_ptr, lod, width);
} else {
LOG(ERROR) << " UNKNOWN sequence pool type";
}
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(sequence_pool_grad,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::SequencePoolGradCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("MaxIndex", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
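The MAX/MIN backward path relies on the argmax positions recorded by the forward kernel (the new MaxIndex output above). A hedged sketch of the assumed scatter, not the actual kernel:

// for each sequence s and each feature column j in [0, width):
//   the forward pass stored the winning row of the sequence in max_index;
//   the backward pass routes the gradient only to that row,
//     x_grad[winning_row * width + j] = dout_grad[s * width + j];
//   all other elements of x_grad stay zero.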
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include "lite/backends/arm/math/type_trans.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class SequencePoolGradCompute
: public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
void PrepareForRun() override;
void Run() override;
virtual ~SequencePoolGradCompute() = default;
private:
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -169,21 +169,47 @@ void SliceCompute<T, PType>::Run() {
using slice_float =
paddle::lite::kernels::arm::SliceCompute<float, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(slice, kARM, kFloat, kNCHW, slice_float, def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("StartsTensor", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("EndsTensor", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("StartsTensorList", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("EndsTensorList", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))})
.BindInput("StartsTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("EndsTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("StartsTensorList",
{LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("EndsTensorList",
{LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))})
.Finalize();
using slice_int32 =
paddle::lite::kernels::arm::SliceCompute<int, PRECISION(kInt32)>;
REGISTER_LITE_KERNEL(slice, kARM, kInt32, kNCHW, slice_int32, def)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("StartsTensor", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("EndsTensor", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("StartsTensorList", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("EndsTensorList", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("StartsTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("EndsTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("StartsTensorList",
{LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("EndsTensorList",
{LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.Finalize();
using slice_int64 =
paddle::lite::kernels::arm::SliceCompute<int64_t, PRECISION(kInt64)>;
REGISTER_LITE_KERNEL(slice, kARM, kInt64, kNCHW, slice_int64, def)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindInput("StartsTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("EndsTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("StartsTensorList",
{LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("EndsTensorList",
{LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.Finalize();
......@@ -21,9 +21,10 @@ namespace lite {
namespace kernels {
namespace arm {
void SplitCompute::Run() {
auto& param = Param<operators::SplitParam>();
const float* din = param.x->data<float>();
template <typename T, PrecisionType PType>
void SplitCompute<T, PType>::Run() {
auto& param = this->template Param<operators::SplitParam>();
const T* din = param.x->template data<T>();
auto& dout = param.output;
auto in_dim = param.x->dims();
std::vector<int> in_strides(in_dim.size());
......@@ -42,12 +43,24 @@ void SplitCompute::Run() {
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
split, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::SplitCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
using split_float =
paddle::lite::kernels::arm::SplitCompute<float, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(split, kARM, kFloat, kNCHW, split_float, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))})
.BindInput("AxisTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("SectionsTensorList",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))})
.Finalize();
using split_int64 =
paddle::lite::kernels::arm::SplitCompute<int64_t, PRECISION(kInt64)>;
REGISTER_LITE_KERNEL(split, kARM, kInt64, kNCHW, split_int64, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindInput("AxisTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("SectionsTensorList",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.Finalize();
......@@ -22,7 +22,8 @@ namespace lite {
namespace kernels {
namespace arm {
class SplitCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
template <typename T, PrecisionType PType>
class SplitCompute : public KernelLite<TARGET(kARM), PType> {
public:
void Run() override;
......
......@@ -93,13 +93,13 @@ void split_compute_ref(const operators::SplitParam& param) {
}
TEST(split_arm, init) {
SplitCompute split;
SplitCompute<float, PRECISION(kFloat)> split;
ASSERT_EQ(split.precision(), PRECISION(kFloat));
ASSERT_EQ(split.target(), TARGET(kARM));
}
TEST(split_arm, compute) {
SplitCompute split;
SplitCompute<float, PRECISION(kFloat)> split;
operators::SplitParam param;
lite::Tensor x;
......
......@@ -38,6 +38,7 @@ add_kernel(bilinear_interp_compute_cuda CUDA basic SRCS bilinear_interp_compute.
add_kernel(search_seq_depadding_compute_cuda CUDA extra SRCS search_seq_depadding_compute.cu DEPS ${lite_kernel_deps})
add_kernel(search_grnn_compute_cuda CUDA extra SRCS search_grnn_compute.cu DEPS ${lite_kernel_deps} cuda_gemm ${math_cuda})
add_kernel(sequence_reverse_compute_cuda CUDA extra SRCS sequence_reverse_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_reverse_embedding_compute_cuda CUDA extra SRCS sequence_reverse_embedding_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_pad_compute_cuda CUDA extra SRCS sequence_pad_compute.cu DEPS ${lite_kernel_deps} ${math_cuda})
add_kernel(sequence_unpad_compute_cuda CUDA extra SRCS sequence_unpad_compute.cu DEPS ${lite_kernel_deps} ${math_cuda})
add_kernel(sequence_concat_compute_cuda CUDA extra SRCS sequence_concat_compute.cu DEPS ${lite_kernel_deps})
......
......@@ -14,6 +14,7 @@
#include "lite/kernels/cuda/gru_compute.h"
#include <string>
#include <vector>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/backends/cuda/math/bias.h"
......@@ -273,6 +274,8 @@ void GRUCompute<T, PType>::Run() {
auto& param = this->template Param<param_t>();
auto* input = param.input;
T* x_data =
const_cast<lite::Tensor*>(input)->template mutable_data<T>(TARGET(kCUDA));
lite::Tensor* h0{nullptr};
if (param.h0) {
h0 = const_cast<lite::Tensor*>(param.h0);
......@@ -289,7 +292,7 @@ void GRUCompute<T, PType>::Run() {
lite::Tensor* hidden = param.hidden;
T* batch_reset_hidden_prev_data =
batch_reset_hidden_prev->template mutable_data<T>(TARGET(kCUDA));
hidden->template mutable_data<T>(TARGET(kCUDA));
T* out_data = hidden->template mutable_data<T>(TARGET(kCUDA));
T* batch_gate_data = batch_gate->template mutable_data<T>(TARGET(kCUDA));
T* batch_hidden_data = batch_hidden->template mutable_data<T>(TARGET(kCUDA));
bool is_reverse = param.is_reverse;
......@@ -300,14 +303,28 @@ void GRUCompute<T, PType>::Run() {
auto hidden_dims = hidden->dims();
int frame_size = hidden_dims[1];
lite::cuda::math::LoDTensor2BatchFunctor<T> batch_func;
batch_func(*input, batch_gate, is_reverse, stream);
LoD offset_vec_vec = input->lod();
std::vector<int> offset(offset_vec_vec[offset_vec_vec.size() - 1].size());
for (size_t i = 0; i < offset_vec_vec[offset_vec_vec.size() - 1].size();
++i) {
offset[i] = static_cast<int>(offset_vec_vec[offset_vec_vec.size() - 1][i]);
}
bool need_process = seq_utils_.GetSortedMap(offset, stream);
int emit_length = seq_utils_.GetEmitOffsetVec().size() - 1;
auto emit_offset_vec = seq_utils_.GetEmitOffsetVec();
if (need_process) {
seq_utils_.Seq2SortedSeq(
input->template data<T>(), batch_gate_data, 3 * frame_size, stream);
x_data = batch_gate_data;
out_data = batch_hidden_data;
}
if (bias) {
// TODO(wilber): validate when bias is not nullptr
lite::cuda::math::RowwiseAdd<T> add_bias;
add_bias(batch_gate_data,
add_bias(x_data,
bias->template data<T>(),
batch_gate_data,
x_data,
frame_size,
batch_gate->numel(),
stream);
......@@ -320,6 +337,7 @@ void GRUCompute<T, PType>::Run() {
// Since the batch computing for GRU reorders the input sequences
// according to their length. The initialized cell state also needs
// to reorder.
// TODO(wilber): validate when h0 is not nullptr
ordered_h0_.Resize(h0->dims());
lite::cuda::math::CopyMatrixRowsFunctor<T> row_shuffle;
row_shuffle(*h0, &ordered_h0_, batch_gate->lod()[2], true, stream);
......@@ -327,15 +345,13 @@ void GRUCompute<T, PType>::Run() {
} else {
gru_value.prev_out_value = nullptr;
}
auto batch_starts = batch_gate->lod()[0];
size_t num_batch = batch_starts.size() - 1;
for (size_t n = 0; n < num_batch; ++n) {
int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]);
for (size_t n = 0; n < emit_length; ++n) {
int bstart = emit_offset_vec[n];
int bend = emit_offset_vec[n + 1];
int cur_batch_size = bend - bstart;
gru_value.output_value = batch_hidden_data + bstart * frame_size;
gru_value.gate_value = batch_gate_data + bstart * frame_size * 3;
gru_value.output_value = out_data + bstart * frame_size;
gru_value.gate_value = x_data + bstart * frame_size * 3;
gru_value.reset_output_value =
batch_reset_hidden_prev_data + bstart * frame_size;
......@@ -349,10 +365,13 @@ void GRUCompute<T, PType>::Run() {
&context);
gru_value.prev_out_value = gru_value.output_value;
}
lite::cuda::math::Batch2LoDTensorFunctor<T> to_seq;
batch_hidden->set_lod(batch_gate->lod());
to_seq(*batch_hidden, hidden, stream);
if (need_process) {
seq_utils_.SortedSeq2Seq(batch_hidden_data,
hidden->mutable_data<T>(TARGET(kCUDA)),
frame_size,
stream);
}
hidden->set_lod(input->lod());
}
} // namespace cuda
......
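The batching change above swaps the LoDTensor2Batch / Batch2LoDTensor pair for SeqSortedseqTranseUtil; a rough mental model of the sorted layout it is assumed to produce:

// Sequences are sorted by length and stored time-step major: the rows for
// time step t of every still-active sequence are contiguous and delimited by
// emit_offset_vec[t] .. emit_offset_vec[t + 1].  The main loop runs one GRU
// step per slice, shorter sequences drop out of later slices, and
// SortedSeq2Seq restores the original sequence order at the end.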
......@@ -16,6 +16,7 @@
#include <memory>
#include "lite/backends/cuda/math/gemm.h"
#include "lite/backends/cuda/math/sequence_helper.h"
#include "lite/core/kernel.h"
#include "lite/operators/op_params.h"
......@@ -38,6 +39,7 @@ class GRUCompute : public KernelLite<TARGET(kCUDA), PType> {
private:
std::unique_ptr<lite::cuda::math::Gemm<T, T>> gemm_impl_{nullptr};
lite::Tensor ordered_h0_;
lite::cuda::math::SeqSortedseqTranseUtil seq_utils_;
};
} // namespace cuda
......
......@@ -52,6 +52,7 @@ __global__ void padding_out(const dtype* src,
const int max_len_r,
const int tl,
const int count,
const bool fuse_relu,
dtype* dst) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int thread_num = blockDim.x * gridDim.x;
......@@ -62,7 +63,13 @@ __global__ void padding_out(const dtype* src,
int r_id = tid % max_len_r;
int cur_len = offset[seq_id + 1] - offset[seq_id];
if (r_id < cur_len) {
dst[tid] = src[(offset[seq_id] + r_id) * tl + tl_id];
if (fuse_relu) {
dst[tid] = src[(offset[seq_id] + r_id) * tl + tl_id] > 0
? src[(offset[seq_id] + r_id) * tl + tl_id]
: 0;
} else {
dst[tid] = src[(offset[seq_id] + r_id) * tl + tl_id];
}
} else {
dst[tid] = 0.f;
}
......@@ -86,6 +93,7 @@ void MatchMatrixTensorCompute::Run() {
auto* tmp = param.tmp;
int dim_t = param.dim_t;
int dim_in = x->dims()[1];
bool fuse_relu = param.fuse_relu;
const auto& offset_l = x->lod()[0];
const auto& offset_r = y->lod()[0];
......@@ -155,6 +163,7 @@ void MatchMatrixTensorCompute::Run() {
max_len_r,
dim_t * len_l,
count,
fuse_relu,
out_data);
out->set_lod(y->lod());
}
......
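The fused branch above is simply a ReLU applied during the padded copy; an equivalent, slightly tighter formulation (sketch only):

// dtype v = src[(offset[seq_id] + r_id) * tl + tl_id];
// dst[tid] = fuse_relu ? (v > 0 ? v : (dtype)0) : v;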
......@@ -37,6 +37,40 @@ __global__ void SequenceMaskKernel(T* dst,
}
}
template <typename T>
__global__ void VecMaxKernel(const T* in_data, T* out, const int count) {
extern __shared__ T cache[];
int i = blockDim.x * blockIdx.x + threadIdx.x;
int cache_index = threadIdx.x;
T tmp = -1;
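// note: -1 is a safe initial value only because the values reduced here
// (sequence lengths) are non-negative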
while (i < count) {
if (in_data[i] > tmp) {
tmp = in_data[i];
}
i += blockDim.x * gridDim.x;
}
cache[cache_index] = tmp;
__syncthreads();
// perform parallel reduction, blockDim.x must be 2^n
int ib = blockDim.x / 2;
while (ib != 0) {
if (cache_index < ib && cache[cache_index + ib] > cache[cache_index]) {
cache[cache_index] = cache[cache_index + ib];
}
__syncthreads();
ib /= 2;
}
if (cache_index == 0) {
out[blockIdx.x] = cache[0];
}
}
template <typename T, PrecisionType Ptype>
void SequenceMaskCompute<T, Ptype>::Run() {
auto& param = this->template Param<param_t>();
......@@ -57,11 +91,34 @@ void SequenceMaskCompute<T, Ptype>::Run() {
}
if (maxlen < 0) {
maxlen = static_cast<int>(
thrust::reduce(thrust::device_pointer_cast(x_data),
thrust::device_pointer_cast(x_data) + x->numel(),
static_cast<int64_t>(0),
thrust::maximum<int64_t>()));
// choose algorithm according to magic_num.
const int magic_num = 256;
std::vector<int64_t> h_max_data;
if (x->numel() < magic_num) {
h_max_data.resize(x->numel());
TargetWrapperCuda::MemcpySync(h_max_data.data(),
x_data,
x->numel() * sizeof(int64_t),
IoDirection::DtoH);
} else {
const int threads = 256;
const int blocks = (x->numel() + threads - 1) / threads;
max_tensor_.Resize({blocks});
auto* max_data = max_tensor_.mutable_data<int64_t>(TARGET(kCUDA));
VecMaxKernel<
int64_t><<<blocks, threads, threads * sizeof(int64_t), stream>>>(
x_data, max_data, x->numel());
h_max_data.resize(blocks);
TargetWrapperCuda::MemcpyAsync(h_max_data.data(),
max_data,
sizeof(int64_t) * blocks,
IoDirection::DtoH,
stream);
TargetWrapperCuda::StreamSync(stream);
}
auto maxlen_iterator =
std::max_element(h_max_data.begin(), h_max_data.end());
maxlen = h_max_data[std::distance(h_max_data.begin(), maxlen_iterator)];
}
auto y_dim = x->dims().Vectorize();
......
......@@ -28,6 +28,9 @@ class SequenceMaskCompute : public KernelLite<TARGET(kCUDA), Ptype> {
void Run() override;
virtual ~SequenceMaskCompute() = default;
private:
lite::Tensor max_tensor_;
};
} // namespace cuda
......
(71 additional file diffs are collapsed and not shown.)