Commit 8c1cb2af authored by chenjiaoAngel

update code

Merge branch 'int8' of https://github.com/chenjiaoAngel/Paddle-Lite into int8
......@@ -63,6 +63,16 @@ test/models/
test/images/
*.pyc
# model
*.nb
*.svg
*.dot
# vim intermediate files
*.swp
# Emacs intermediate files
*~
......
......@@ -97,7 +97,7 @@ lite_option(LITE_WITH_FPGA "Enable FPGA support in lite" OFF)
lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OFF)
lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF)
lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF)
lite_option(LITE_SHUTDOWN_LOG "Shutdown log system or not." OFF)
lite_option(LITE_WITH_LOG "Enable log printing or not." ON)
lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF)
lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
# publish options
......
......@@ -61,7 +61,8 @@ For demands of Apple's GPU Metal and web front end inference, please see `./meta
Paddle Lite has referenced the following open-source projects:
- [ARM compute library](http://agroup.baidu.com/paddle-infer/md/article/%28https://github.com/ARM-software/ComputeLibrary%29)
- [Anakin](https://github.com/PaddlePaddle/Anakin). The optimizations under Anakin have been incorporated into Paddle Lite, and so there will not be any future updates of Anakin. As another high-performance inference project under PaddlePaddle, Anakin has been forward-looking and helpful to the making of Paddle Lite.
- [Anakin](https://github.com/PaddlePaddle/Anakin). The optimizations under Anakin have been incorporated into Paddle Lite, and so there will not be any future updates of Anakin. As another high-performance inference project under PaddlePaddle, Anakin has been forward-looking and helpful to the making of Paddle Lite.
## Feedback and Community Support
......
......@@ -186,8 +186,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
add_definitions("-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK")
endif()
if (LITE_SHUTDOWN_LOG)
add_definitions("-DLITE_SHUTDOWN_LOG")
if (LITE_WITH_LOG)
add_definitions("-DLITE_WITH_LOG")
endif()
if (LITE_ON_TINY_PUBLISH)
......
......@@ -32,34 +32,3 @@ endif()
message(STATUS "APU_DDK_INC: ${APU_DDK_INC}")
include_directories("${APU_DDK_ROOT}/include")
set(APU_SUB_LIB_PATH "lib64")
if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
set(APU_SUB_LIB_PATH "lib64")
endif()
find_library(APU_NEURON_FILE NAMES neuron
PATHS ${APU_DDK_ROOT}/${APU_SUB_LIB_PATH})
find_library(APU_NEURON_ADAPTER_FILE NAMES neuron_adapter
PATHS ${APU_DDK_ROOT}/${APU_SUB_LIB_PATH})
if(NOT APU_NEURON_FILE)
message(FATAL_ERROR "Can not find APU_NEURON_FILE in ${APU_DDK_ROOT}")
else()
message(STATUS "Found APU NEURON Library: ${APU_NEURON_FILE}")
add_library(apu_neuron SHARED IMPORTED GLOBAL)
set_property(TARGET apu_neuron PROPERTY IMPORTED_LOCATION ${APU_NEURON_FILE})
endif()
if(NOT APU_NEURON_ADAPTER_FILE)
message(FATAL_ERROR "Can not find APU_NEURON_ADAPTER_FILE in ${APU_DDK_ROOT}")
else()
message(STATUS "Found APU NEURON ADAPTER Library: ${APU_NEURON_ADAPTER_FILE}")
add_library(apu_neuron_adapter SHARED IMPORTED GLOBAL)
set_property(TARGET apu_neuron_adapter PROPERTY IMPORTED_LOCATION ${APU_NEURON_ADAPTER_FILE})
endif()
set(apu_runtime_libs apu_neuron apu_neuron_adapter CACHE INTERNAL "apu runtime libs")
message(STATUS "${apu_runtime_libs}")
......@@ -45,7 +45,7 @@ else()
# we changed the source code to adapt for windows compiling
# git diffs : (1) unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
######################################################################################################
URL https://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip
URL http://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip
DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR}
DOWNLOAD_NO_PROGRESS 1
PREFIX ${EIGEN_SOURCE_DIR}
......
......@@ -400,7 +400,7 @@ std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<MobileConfig>
- `None`
Returns: the model structure data in memory
Returns: the model parameter data in memory
Return type: `const std::string&`
......
# Deploying PaddleLite for Inference on Baidu XPU
Paddle Lite supports inference deployment on Baidu XPU on x86 and ARM servers (e.g. Phytium FT-2000+/64).
Two integration modes are currently supported: Kernel and subgraph. The subgraph mode is similar to the earlier Huawei NPU integration: the Paddle model is loaded and analyzed, Paddle operators are converted into XTCL graph-building API calls to construct the network, and the model is generated and executed online.
## Current Support
### Supported Chips
- Kunlun 818-100 (inference chip)
- Kunlun 818-300 (training chip)
### Supported Devices
- K100/K200 Kunlun AI accelerator cards
### Supported Paddle Models
- [ResNet50](https://paddlelite-demo.bj.bcebos.com/models/resnet50_fp32_224_fluid.tar.gz)
- [BERT](https://paddlelite-demo.bj.bcebos.com/models/bert_fp32_fluid.tar.gz)
- [ERNIE](https://paddlelite-demo.bj.bcebos.com/models/ernie_fp32_fluid.tar.gz)
- YOLOv3
- Mask R-CNN
- Faster R-CNN
- UNet
- SENet
- SSD
- Internal Baidu production models (details cannot be disclosed for confidentiality reasons)
### Supported (or Partially Supported) Paddle Operators (Kernel mode)
- scale
- relu
- tanh
- sigmoid
- stack
- matmul
- pool2d
- slice
- lookup_table
- elementwise_add
- elementwise_sub
- cast
- batch_norm
- mul
- layer_norm
- softmax
- conv2d
- io_copy
- io_copy_once
- __xpu__fc
- __xpu__multi_encoder
- __xpu__resnet50
- __xpu__embedding_with_eltwise_add
### Supported (or Partially Supported) Paddle Operators (subgraph/XTCL mode)
- relu
- tanh
- conv2d
- depthwise_conv2d
- elementwise_add
- pool2d
- softmax
- mul
- batch_norm
- stack
- gather
- scale
- lookup_table
- slice
- transpose
- transpose2
- reshape
- reshape2
- layer_norm
- gelu
- dropout
- matmul
- cast
- yolo_box
## Reference Example
### Test Device (K100 Kunlun AI Accelerator Card)
![baidu_xpu](https://paddlelite-demo.bj.bcebos.com/devices/baidu/baidu_xpu.jpg)
### Preparing the Device Environment
- K100/K200 Kunlun AI accelerator card [specification sheet](https://paddlelite-demo.bj.bcebos.com/devices/baidu/K100_K200_spec.pdf); for a more detailed specification or to purchase the product, please contact Ouyang Jian at ouyangjian@baidu.com;
- The K100 is a full-length, half-height PCI-E card and the K200 is a full-length, full-height PCI-E card; both require a PCI-E x16 slot and a separate 8-pin power cable;
- Install the K100/K200 driver. Ubuntu and CentOS are currently supported; because the driver depends on the Linux kernel version, make sure to install the driver package matching your kernel.
### Preparing the Local Build Environment
- To keep the build environment consistent, it is recommended to follow the Linux development environment described in [Compiling from Source](../user_guides/source_compile);
- Building the example program requires OpenCV and CMake 3.10.3; install them with the following commands:
```shell
$ sudo apt-get update
$ sudo apt-get install gcc g++ make wget unzip libopencv-dev pkg-config
$ wget https://www.cmake.org/files/v3.10/cmake-3.10.3.tar.gz
$ tar -zxvf cmake-3.10.3.tar.gz
$ cd cmake-3.10.3
$ ./configure
$ make
$ sudo make install
```
### Running the Image Classification Example
- Download the example program from [https://paddlelite-demo.bj.bcebos.com/devices/baidu/PaddleLite-linux-demo.tar.gz](https://paddlelite-demo.bj.bcebos.com/devices/baidu/PaddleLite-linux-demo.tar.gz); after extraction the contents are as follows:
```shell
- PaddleLite-linux-demo
  - image_classification_demo
    - assets
      - images
        - tabby_cat.jpg # test image
      - labels
        - synset_words.txt # label file for the 1000 ImageNet classes
      - models
        - resnet50_fp32_224_fluid # ResNet50 float32 model in Paddle fluid non-combined format
          - __model__ # Paddle fluid model topology file; drag it into https://lutzroeder.github.io/netron/ to visualize the network structure
          - bn2a_branch1_mean # Paddle fluid model parameter files
          - bn2a_branch1_scale
          ...
    - shell
      - CMakeLists.txt # CMake script for the example program
      - build
        - image_classification_demo # prebuilt example program for amd64
      - image_classification_demo.cc # example program source code
      - build.sh # build script for the example program
      - run.sh # run script for the example program
  - libs
    - PaddleLite
      - amd64
        - include # PaddleLite header files
        - lib
          - libiomp5.so # Intel OpenMP library
          - libmklml_intel.so # Intel MKL library
          - libxpuapi.so # XPU API library, providing device management and operator implementations
          - libxpurt.so # XPU runtime library
          - libpaddle_full_api_shared.so # prebuilt PaddleLite full API library
      - arm64
        - include # PaddleLite header files
        - lib
          - libxpuapi.so # XPU API library, providing device management and operator implementations
          - libxpurt.so # XPU runtime library
          - libpaddle_full_api_shared.so # prebuilt PaddleLite full API library
```
- Enter PaddleLite-linux-demo/image_classification_demo/shell and simply run ./run.sh amd64:
```shell
$ cd PaddleLite-linux-demo/image_classification_demo/shell
$ ./run.sh amd64 # an amd64 build/image_classification_demo is already provided, so the example can run without rebuilding.
$ ./run.sh arm64 # run ./build.sh arm64 on an arm64 (FT-2000+/64) server first before executing this command.
...
AUTOTUNE:(12758016, 16, 1, 2048, 7, 7, 512, 1, 1, 1, 1, 0, 0, 0) = 1by1_bsp(1, 32, 128, 128)
Find Best Result in 150 choices, avg-conv-op-time = 40 us
[INFO][XPUAPI][/home/qa_work/xpu_workspace/xpu_build_dailyjob/api_root/baidu/xpu/api/src/wrapper/conv.cpp:274] Start Tuning: (12758016, 16, 1, 512, 7, 7, 512, 3, 3, 1, 1, 1, 1, 0)
AUTOTUNE:(12758016, 16, 1, 512, 7, 7, 512, 3, 3, 1, 1, 1, 1, 0) = wpinned_bsp(1, 171, 16, 128)
Find Best Result in 144 choices, avg-conv-op-time = 79 us
I0502 22:34:18.176113 15876 io_copy_compute.cc:75] xpu to host, copy size 4000
I0502 22:34:18.176406 15876 io_copy_compute.cc:36] host to xpu, copy size 602112
I0502 22:34:18.176697 15876 io_copy_compute.cc:75] xpu to host, copy size 4000
iter 0 cost: 2.116000 ms
I0502 22:34:18.178530 15876 io_copy_compute.cc:36] host to xpu, copy size 602112
I0502 22:34:18.178792 15876 io_copy_compute.cc:75] xpu to host, copy size 4000
iter 1 cost: 2.101000 ms
I0502 22:34:18.180634 15876 io_copy_compute.cc:36] host to xpu, copy size 602112
I0502 22:34:18.180881 15876 io_copy_compute.cc:75] xpu to host, copy size 4000
iter 2 cost: 2.089000 ms
I0502 22:34:18.182726 15876 io_copy_compute.cc:36] host to xpu, copy size 602112
I0502 22:34:18.182976 15876 io_copy_compute.cc:75] xpu to host, copy size 4000
iter 3 cost: 2.085000 ms
I0502 22:34:18.184814 15876 io_copy_compute.cc:36] host to xpu, copy size 602112
I0502 22:34:18.185068 15876 io_copy_compute.cc:75] xpu to host, copy size 4000
iter 4 cost: 2.101000 ms
warmup: 1 repeat: 5, average: 2.098400 ms, max: 2.116000 ms, min: 2.085000 ms
results: 3
Top0 tabby, tabby cat - 0.689418
Top1 tiger cat - 0.190557
Top2 Egyptian cat - 0.112354
Preprocess time: 1.553000 ms
Prediction time: 2.098400 ms
Postprocess time: 0.081000 ms
```
- To test a different image, copy it into PaddleLite-linux-demo/image_classification_demo/assets/images and set IMAGE_NAME in run.sh to that file name;
- To rebuild the example program, simply run ./build.sh amd64 or ./build.sh arm64:
```shell
$ cd PaddleLite-linux-demo/image_classification_demo/shell
$ ./build.sh amd64 # For amd64
$ ./build.sh arm64 # For arm64(FT-2000+/64)
```
### Updating the Model
- Obtain the ResNet50 float32 model [resnet50_fp32_224_fluid](https://paddlelite-demo.bj.bcebos.com/models/resnet50_fp32_224_fluid.tar.gz) by training with Paddle Fluid or converting with X2Paddle;
- Since XPU is typically deployed on the server side, the PaddleLite full API is used to load the original Paddle Fluid model for inference, i.e. the relevant parameters are configured through CxxConfig, as sketched below.
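The following is a minimal sketch of what such a full-API program could look like; it assumes the public `paddle_api.h` header from the prebuilt library, the `TARGET(kXPU)`/`TARGET(kX86)` places, and an all-ones input, and it is not an excerpt from the demo source.
```c++
// Minimal sketch (assumed usage of the public full API): load the Paddle Fluid
// model with CxxConfig and run it, preferring XPU kernels with x86 fallback.
#include <vector>
#include "paddle_api.h"  // from inference_lite_lib/cxx/include

int main() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("./resnet50_fp32_224_fluid");  // non-combined model dir
  config.set_valid_places({
      paddle::lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
      paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
      paddle::lite_api::Place{TARGET(kHost), PRECISION(kFloat)},
  });
  auto predictor =
      paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::CxxConfig>(config);

  // Feed an all-ones 1x3x224x224 image.
  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});
  auto* data = input->mutable_data<float>();
  for (int i = 0; i < 1 * 3 * 224 * 224; ++i) data[i] = 1.0f;

  predictor->Run();

  // Read back the 1000 classification scores.
  auto output = predictor->GetOutput(0);
  const float* scores = output->data<float>();
  (void)scores;
  return 0;
}
```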
### Updating the Paddle Lite Library with Baidu XPU Support
- Download the PaddleLite source code:
```shell
$ git clone https://github.com/PaddlePaddle/Paddle-Lite.git
$ cd Paddle-Lite
$ git checkout <release-version-tag>
```
- Download xpu_toolchain for amd64 or arm64 (FT-2000+/64):
```shell
$ wget <URL_to_download_xpu_toolchain>
$ tar -xvf output.tar.gz
$ mv output xpu_toolchain
```
- Build full_publish for amd64 or arm64 (FT-2000+/64):
```shell
For amd64 (if the build reports undefined cxx11:: symbols, switch gcc to version 4.8)
$ ./lite/tools/build.sh --build_xpu=ON --xpu_sdk_root=./xpu_toolchain x86
For arm64(FT-2000+/64)
$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv8 --arm_lang=gcc --build_extra=ON --build_xpu=ON --xpu_sdk_root=./xpu_toolchain --with_log=ON full_publish
```
- Replace the PaddleLite-linux-demo/libs/PaddleLite/amd64/include directory with the generated build.lite.x86/inference_lite_lib/cxx/include;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/amd64/lib/libpaddle_full_api_shared.so file with the generated build.lite.x86/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/include directory with the generated build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.xpu/cxx/include;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/lib/libpaddle_full_api_shared.so file with the generated build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.xpu/cxx/lib/libpaddle_full_api_shared.so.
## Additional Notes
- For more information about these products, please contact Ouyang Jian at ouyangjian@baidu.com;
- The Baidu Kunlun engineering team is continuously adapting more Paddle operators in order to support more Paddle models.
......@@ -48,7 +48,7 @@ The CUDA build output is located in `build_cuda/inference_lite_lib`
4. `demo` folder: C++ demos.
If the Python option was enabled at build time, `lite_core.so` is generated under `build_cuda/inference_lite_lib/python/lib/`
If the Python option was enabled at build time, `lite.so` is generated under `build_cuda/inference_lite_lib/python/lib/`
## Running
......@@ -66,7 +66,7 @@ wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/kite.jpg
Step 2: Run
**NOTE:**This example uses the Python API.
**NOTE:** This example uses the Python API.
``` python
#-*- coding: utf-8 -*-
......@@ -75,7 +75,7 @@ import sys
import numpy as np
import cv2
sys.path.append('build_cuda/inference_lite_lib/python/lib')
from lite_core import *
from lite import *
def read_img(im_path, resize_h, resize_w):
im = cv2.imread(im_path).astype('float32')
......
# Deploying PaddleLite for Inference on MTK APU
Paddle Lite supports inference deployment on the MTK APU.
The integration works like the earlier Huawei NPU support: the Paddle model is loaded and analyzed, Paddle operators are converted into MTK Neuron Adapter API calls (similar to the Android NN API) to construct the network, and the model is generated and executed online.
## Current Support
### Supported Chips
- [MT8168](https://www.mediatek.cn/products/tablets/mt8168)/[MT8175](https://www.mediatek.cn/products/tablets/mt8175) and other smart-device chips.
### Supported Devices
- MT8168-P2V1 Tablet.
### Supported Paddle Models
- [Fully quantized MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mobilenet_v1_int8_224_fluid.tar.gz)
### Supported (or Partially Supported) Paddle Operators
- relu
- conv2d
- depthwise_conv2d
- elementwise_add
- elementwise_mul
- fc
- pool2d
- softmax
## Reference Example
### Test Device (MT8168-P2V1 Tablet)
![mt8168_p2v1_tablet_front](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mt8168_p2v1_tablet_front.jpg)
![mt8168_p2v1_tablet_back](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mt8168_p2v1_tablet_back.jpg)
### Preparing the Device Environment
- A specific firmware version is required; if you are interested, contact MTK through [https://www.mediatek.cn/about/contact-us](https://www.mediatek.cn/about/contact-us) (choose "Sales" as the category) to obtain a test device and the firmware;
### Preparing the Cross-Compilation Environment
- To keep the build environment consistent, it is recommended to follow the Docker development environment described in [Compiling from Source](../user_guides/source_compile).
### Running the Image Classification Example
- Download the example program from [https://paddlelite-demo.bj.bcebos.com/devices/mediatek/PaddleLite-android-demo.tar.gz](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/PaddleLite-android-demo.tar.gz); after extraction the contents are as follows:
```shell
- PaddleLite-android-demo
  - image_classification_demo
    - assets
      - images
        - tabby_cat.jpg # test image
      - labels
        - synset_words.txt # label file for the 1000 ImageNet classes
      - models
        - mobilenet_v1_int8_224_for_cpu.nb # quantized MobileNetV1 model converted by opt for the ARM CPU
        - mobilenet_v1_int8_224_for_apu.nb # quantized MobileNetV1 model converted by opt for the MTK APU
    - shell # example program for the Android shell
      - CMakeLists.txt # CMake script for the example program
      - build
        - image_classification_demo # prebuilt example program for the Android shell
      - image_classification_demo.cc # example program source code
      - build.sh # build script for the example program
      - run.sh # run script for the example program
    - apk # regular Android application
      - app
        - src
          - main
            - java # Java-layer code
            - cpp # custom JNI implementation
        - app.iml
        - build.gradle
      - gradle
      ...
  - libs
    - PaddleLite
      - arm64-v8a
        - include # PaddleLite header files
        - lib
          - libc++_shared.so
          - libpaddle_light_api_shared.so # prebuilt PaddleLite library
    - OpenCV # OpenCV 4.2 for android
```
- Android shell example program
- Enter PaddleLite-android-demo/image_classification_demo/shell and simply run ./run.sh. Note: run.sh must not be executed inside the Docker environment, otherwise the device may not be found;
- To test a different image, copy it into PaddleLite-android-demo/image_classification_demo/assets/images and set IMAGE_NAME in run.sh to that file name;
- To rebuild the example program, simply run ./build.sh. Note: build.sh must be executed inside the Docker environment, otherwise the build may fail;
- Note that because the MTK APU currently supports only the NHWC data layout while PaddleLite defaults to NCHW, an extra NCHW-to-NHWC conversion of the input tensor is performed during inference, which takes roughly 8-9 ms:
```shell
$ cd PaddleLite-android-demo/image_classification_demo/shell
$ ./run.sh
...
warmup: 5 repeat: 10, average: 30.998502 ms, max: 31.049002 ms, min: 30.937002 ms
results: 3
Top0 Egyptian cat - -0.122845
Top1 tabby, tabby cat - -0.122845
Top2 tiger cat - -0.544028
Preprocess time: 3.620000 ms
Prediction time: 30.998502 ms
Postprocess time: 0.069000 ms
[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b00000, pa = 0xfb3f9000, len = 255
[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1af8000, pa = 0xfb3fa000, len = 255
[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1af7000, pa = 0xf8ffe000, len = 255
[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1af6000, pa = 0xf7bfe000, len = 255
[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1af5000, pa = 0xf7bfd000, len = 255
[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b0c000, pa = 0xfb3fe000, len = 255
[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b0b000, pa = 0xfb3ff000, len = 255
[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b0a000, pa = 0xf31ff000, len = 255
[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b09000, pa = 0xfb3f6000, len = 255
[vpuBuffer] vpuMemAllocator::freeMem: type = 1, va = 0x7ed1b08000, pa = 0xf7bff000, len = 255
```
- Regular Android application
- Install Android Studio 3.4;
- Open Android Studio, click "Open an existing Android Studio project" in the "Welcome to Android Studio" window, navigate to the "image_classification_demo" directory in the path selection dialog, and click the "Open" button in the lower right to import the project;
- Connect an Android phone, tablet, or development board via USB;
- Temporarily disable SELinux so the app can call system libraries:
```shell
$ adb root
# setenforce 0
```
- After the project loads, click Run -> Run 'App' in the menu bar, select the connected Android device in the "Select Deployment Target" dialog, and click "OK";
- After about one minute (the first run takes longer, please be patient), the app is installed on the device. By default it runs inference with the ARM CPU model; since the MT8168 CPU consists of four Arm Cortex-A53 cores, which are much weaker than the A7x series found in typical phones, it reaches only 6 fps, as shown below;
![mt8168_p2v1_tablet_cpu](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mt8168_p2v1_tablet_cpu.jpg)
- Tap the settings button in the lower right of the app, tap "Choose pre-installed models" on the settings page, and select "mobilenet_v1_int8_for_apu"; after tapping back, the app switches to the APU model and the frame rate rises to 14 fps, as shown below.
![mt8168_p2v1_tablet_apu](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mt8168_p2v1_tablet_apu.jpg)
### Updating the Model
- Obtain the MobileNetV1 float32 model [mobilenet_v1_fp32_224_fluid](https://paddlelite-demo.bj.bcebos.com/models/mobilenet_v1_fp32_224_fluid.tar.gz) by training with Paddle Fluid or converting with X2Paddle;
- Quantize the float32 model with PaddleSlim following [Model Quantization - Post-Training Quantization with Calibration Data](../user_guides/post_quant_with_data) (note: since the MTK APU supports only quantized ops, pay attention to the relevant parameters when launching the quantization script), which yields the fully quantized MobileNetV1 model [mobilenet_v1_int8_224_fluid](https://paddlelite-demo.bj.bcebos.com/devices/mediatek/mobilenet_v1_int8_224_fluid.tar.gz);
- Following [Model Conversion](../user_guides/model_optimize_tool), convert the model with the opt tool to generate an MTK APU model; simply set valid_targets to apu,arm:
```shell
$ ./opt --model_dir=mobilenet_v1_int8_224_fluid \
--optimize_out_type=naive_buffer \
--optimize_out=mobilenet_v1_int8_224_for_apu \
--valid_targets=apu,arm
```
- Note: the model produced by opt only marks the Paddle operators supported by the MTK APU; it does not actually generate an MTK APU model. Only at run time are the marked Paddle operators converted into MTK Neuron Adapter API calls to build the network and generate and execute the model. The resulting .nb model is loaded through the light API, as in the sketch below.
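For reference, a minimal sketch of how an opt-generated `.nb` model is loaded through the light API is given below; the header name, input layout, and file path are illustrative assumptions rather than an excerpt from the demo source.
```c++
// Minimal sketch (assumed usage of the light API): load the .nb model produced
// by opt; operators marked for the APU are converted and executed at run time.
#include <algorithm>
#include "paddle_api.h"

int main() {
  paddle::lite_api::MobileConfig config;
  config.set_model_from_file("mobilenet_v1_int8_224_for_apu.nb");
  auto predictor =
      paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::MobileConfig>(config);

  // MobileNetV1 expects a 1x3x224x224 NCHW float input.
  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});
  auto* data = input->mutable_data<float>();
  std::fill(data, data + 1 * 3 * 224 * 224, 1.0f);

  predictor->Run();

  auto output = predictor->GetOutput(0);
  const float* scores = output->data<float>();  // 1000 class scores
  (void)scores;
  return 0;
}
```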
### Updating the Paddle Lite Library with MTK APU Support
- Download the PaddleLite source code and the APU DDK:
```shell
$ git clone https://github.com/PaddlePaddle/Paddle-Lite.git
$ cd Paddle-Lite
$ git checkout <release-version-tag>
$ wget https://paddlelite-demo.bj.bcebos.com/devices/mediatek/apu_ddk.tar.gz
$ tar -xvf apu_ddk.tar.gz
```
- Build full_publish and tiny_publish for the MT8168-P2V1 Tablet:
```shell
$ ./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_shared --build_extra=ON --with_log=ON --build_apu=ON --apu_ddk_root=./apu_ddk full_publish
$ ./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_shared --build_extra=ON --with_log=ON --build_apu=ON --apu_ddk_root=./apu_ddk tiny_publish
```
- Replace the PaddleLite-android-demo/libs/PaddleLite/arm64-v8a/include directory with the generated build.lite.android.armv8.gcc/inference_lite_lib.android.armv8.apu/cxx/include;
- Replace the PaddleLite-android-demo/libs/PaddleLite/arm64-v8a/lib/libpaddle_light_api_shared.so file with the generated build.lite.android.armv8.gcc/inference_lite_lib.android.armv8.apu/cxx/lib/libpaddle_light_api_shared.so.
## Additional Notes
- Due to licensing restrictions we cannot provide the firmware for testing, for which we apologize. If you are genuinely interested, contact MTK sales directly through the contact information mentioned above;
- The MTK engineering team is continuously adding operator bridges/converters for Paddle operators in order to support more Paddle models.
......@@ -126,3 +126,80 @@ $ ./lite/tools/build_npu.sh --arm_os=android --arm_abi=armv7 --arm_lang=gcc --an
- The Huawei Da Vinci architecture NPU relies heavily on float16 internally, so the inference results show some deviation, but in most cases the accuracy loss is small; see the CPU and NPU results for the same image in the Image Classification Demo for Android of [Paddle-Lite-Demo](https://github.com/PaddlePaddle/Paddle-Lite-Demo).
- The in-house Da Vinci architecture NPU in the Huawei Kirin 810/990 SoCs differs from the Cambricon NPU in the Kirin 970/980 SoCs, and likewise from the NNIE used by the Hi3559A and Hi3519A; Paddle Lite supports only Huawei's in-house Da Vinci architecture NPU.
- We are continuously adding Paddle operator bridges/converters that map onto HiAI IR in order to support more Paddle models, and the Huawei engineering team keeps optimizing HiAI IR performance.
## Manual Subgraph Partitioning
### Background
- Paddle-Lite already supports a large number of Huawei NPU operators, but this still does not cover every model. For a model with some unsupported operators, Paddle-Lite partitions it into subgraphs that run on the NPU and subgraphs that run on the CPU, scheduling between NPU and CPU automatically, which usually gives good performance. In some special cases, however, the model is automatically split into many subgraphs, the switching overhead between CPU and NPU becomes large, and overall performance degrades. Manual subgraph partitioning is therefore needed to force certain operators onto the CPU and avoid an excessive number of subgraphs.
### Feature
- Specify, via a configuration file, the operators that must be forced to run on the CPU.
### Usage
- 1. Open the Paddle model file with Netron to inspect the model structure and obtain each operator's type, input names, and output names.
- Note: Paddle-Lite optimizes the model, and its operators may change; the operators of the optimized model are what counts. An example is given below.
- 2. Create a configuration file ```split_cfg.txt``` recording the operators that should run on the CPU.
- One operator record per line: the "op name", "op input names", and "op output names" fields are separated by a colon ":", and the variable names within the "op input names" and "op output names" fields are separated by commas ",".
- Input or output names may be partially omitted. For example, ```op3:in3_var0``` selects operators of type "op3" whose input is "in3_var0", while ```op4``` selects all operators of type "op4".
- Example 1:
```
op0:in0_var0,in0_var1:out0_var0,out0_var1
op1:in1_var0,in1_var1:out1_var0
op2::out2_var0
op3:in3_var0
op4
```
- Example 2:
```
transpose:conv2d_22.tmp_1:transpose_0.tmp_0
```
![image](https://user-images.githubusercontent.com/50474132/80475316-4a5fda80-897b-11ea-910a-6aee13243387.png)
- 3. Point the environment variable ```SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE``` at the configuration file.
- For example:
```
export SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE=/data/local/tmp/split_cfg.txt
```
- 4. After these steps, the matching operators in the model are forced to run on the CPU at runtime.
### Example
- Take the model [ssd_mobilenet_v1_pascalvoc_fp32_300_fluid](https://paddlelite-demo.bj.bcebos.com/models/ssd_mobilenet_v1_pascalvoc_fp32_300_fluid.tar.gz) as an example.
- 1. Inspect the model with Netron.
- 2. Preliminary analysis.
- The figure below shows part of ssd_mobilenet_v1. The red portion is not yet supported on the NPU, and the blue portion may not perform well on the NPU. If the prediction library is left to schedule automatically, the model may be split into many subgraphs and overall performance suffers. Instead, the blue and green portions can be pinned to the CPU as a whole, letting the remaining parts run automatically on the NPU (the red portion falls back to the CPU automatically).
![ssd_mobilenet_v1_example](https://user-images.githubusercontent.com/50474132/80453173-525b5280-895a-11ea-847f-c7dd5b5799de.png)
- 3. Convert the model with opt.
- opt prints log information during conversion; search the log for ```digraph G``` and ```// end G``` to locate the optimized model graph.
![image](https://user-images.githubusercontent.com/50474132/80454098-145f2e00-895c-11ea-9f16-dde1483a9beb.png)
![image](https://user-images.githubusercontent.com/50474132/80454123-1de89600-895c-11ea-86b9-a62d78a6616d.png)
- Save the whole graph section, from ```digraph G``` through ```// end G```, into a ```.dot``` file. It can be viewed with ```graphviz``` or in the [online viewer](http://dreampuf.github.io/GraphvizOnline/).
![image](https://user-images.githubusercontent.com/50474132/80454841-47ee8800-895d-11ea-9531-5689c5560fcb.png)
- Here, check whether the operators to be pinned have been optimized. (Ideally each of them still exists independently; if some have been fused into a single operator, the fused operator must be specified instead.)
- 4. Write the configuration file.
- In the configuration file, list the operators that the NPU could support but that should nevertheless run on the CPU:
```
reshape
transpose
concat
softmax
```
- Since every instance of these operator types is to be pinned to the CPU, there is no need to specify their input and output names.
- 5. Set the configuration file path.
- This is done via ```export SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE=your_split_config_file```.
- 6. Performance test.
- Device: Huawei Mate 30 5G
- HiAI DDK version: 320
- Performance: about 71.8 ms on CPU, about 16.6 ms on NPU.
# Deploying PaddleLite for Inference on RK NPU
Paddle Lite supports inference deployment on the RK NPU.
The integration works like the earlier Huawei NPU support: the Paddle model is loaded and analyzed, Paddle operators are converted into RK graph-building API calls to construct the network, and the model is generated and executed online.
## Current Support
### Supported Chips
- RK1808 and RK1806; RK3399Pro is not yet supported.
### Supported Devices
- RK1808/1806 EVB.
### Supported Paddle Models
- [Fully quantized MobileNetV1](https://paddlelite-demo.bj.bcebos.com/devices/rockchip/mobilenet_v1_int8_224_fluid.tar.gz)
### Supported (or Partially Supported) Paddle Operators
- relu
- conv2d
- depthwise_conv2d
- pool2d
- fc
- softmax
- batch_norm
- concat
- elementwise_add
- elementwise_sub
- elementwise_mul
- elementwise_div
## Reference Example
### Test Device (RK1808 EVB)
![rk1808_evb_front](https://paddlelite-demo.bj.bcebos.com/devices/rockchip/rk1808_evb_front.jpg)
![rk1808_evb_back](https://paddlelite-demo.bj.bcebos.com/devices/rockchip/rk1808_evb_back.jpg)
### Preparing the Device Environment
- A specific firmware version is required; please update the device firmware following the instructions in [rknpu_ddk](https://github.com/airockchip/rknpu_ddk);
- After flashing the firmware, the RK1808 EVB is a bare Linux system on which software cannot be installed conveniently with apt-get as on Ubuntu, so both the example program and the PaddleLite library are built by cross-compilation;
- Plug a Micro-USB cable into the device's Micro-USB OTG port and you can interact with it through Android's adb commands, with no need to configure the network for SSH or go through the serial port; a very convenient design.
### Preparing the Cross-Compilation Environment
- To keep the build environment consistent, it is recommended to follow the Docker development environment described in [Compiling from Source](../user_guides/source_compile).
### Running the Image Classification Example
- Download the example program from [https://paddlelite-demo.bj.bcebos.com/devices/rockchip/PaddleLite-linux-demo.tar.gz](https://paddlelite-demo.bj.bcebos.com/devices/rockchip/PaddleLite-linux-demo.tar.gz); after extraction the contents are as follows:
```shell
- PaddleLite-linux-demo
  - image_classification_demo
    - assets
      - images
        - tabby_cat.jpg # test image
        - tabby_cat.raw # test image already converted to raw data
      - labels
        - synset_words.txt # label file for the 1000 ImageNet classes
      - models
        - mobilenet_v1_int8_224_for_cpu.nb # quantized MobileNetV1 model converted by opt for the ARM CPU
        - mobilenet_v1_int8_224_for_rknpu.nb # quantized MobileNetV1 model converted by opt for the RK NPU
    - shell
      - CMakeLists.txt # CMake script for the example program
      - build
        - image_classification_demo # prebuilt example program
      - image_classification_demo.cc # example program source code
      - convert_to_raw_image.py # Python script that saves a test image as raw data
      - build.sh # build script for the example program
      - run.sh # run script for the example program
  - libs
    - PaddleLite
      - arm64
        - include # PaddleLite header files
        - lib
          - libGAL.so # RK DDK libraries
          - libOpenVX.so
          - libVSC.so
          - librknpu_ddk.so
          - libgomp.so.1 # GNU OpenMP library
          - libpaddle_light_api_shared.so # prebuilt PaddleLite library
      - armhf
        - include # PaddleLite header files
        - lib
          - libGAL.so
          - libOpenVX.so
          - libVSC.so
          - librknpu_ddk.so
          - libgomp.so.1
          - libpaddle_light_api_shared.so
```
- Enter PaddleLite-linux-demo/image_classification_demo/shell and simply run ./run.sh arm64. Note: run.sh must not be executed inside the Docker environment, otherwise the device cannot be found;
```shell
$ cd PaddleLite-linux-demo/image_classification_demo/shell
$ ./run.sh arm64 # For RK1808 EVB
$ ./run.sh armhf # For RK1806 EVB
...
warmup: 5 repeat: 10, average: 6.499500 ms, max: 6.554000 ms, min: 6.468000 ms
results: 3
Top0 Egyptian cat - 0.532328
Top1 tabby, tabby cat - 0.345136
Top2 tiger cat - 0.111146
Preprocess time: 2.414000 ms
Prediction time: 6.499500 ms
Postprocess time: 0.414000 ms
```
- To test a different image, generate the raw data with the convert_to_raw_image.py tool;
- To rebuild the example program, simply run ./build.sh. Note: build.sh must be executed inside the Docker environment, otherwise the build may fail.
### Updating the Model
- Obtain the MobileNetV1 float32 model [mobilenet_v1_fp32_224_fluid](https://paddlelite-demo.bj.bcebos.com/models/mobilenet_v1_fp32_224_fluid.tar.gz) by training with Paddle Fluid or converting with X2Paddle;
- Quantize the float32 model with PaddleSlim following [Model Quantization - Post-Training Quantization with Calibration Data](../user_guides/post_quant_with_data) (note: since the RK NPU supports only tensor-wise fully quantized models, pay attention to the relevant parameters when launching the quantization script), which yields the fully quantized MobileNetV1 model [mobilenet_v1_int8_224_fluid](https://paddlelite-demo.bj.bcebos.com/devices/rockchip/mobilenet_v1_int8_224_fluid.tar.gz);
- Following [Model Conversion](../user_guides/model_optimize_tool), convert the model with the opt tool to generate an RK NPU model; simply set valid_targets to rknpu,arm:
```shell
$ ./opt --model_dir=mobilenet_v1_int8_224_fluid \
--optimize_out_type=naive_buffer \
--optimize_out=mobilenet_v1_int8_224_for_rknpu \
--valid_targets=rknpu,arm
```
- Note: the model produced by opt only marks the Paddle operators supported by the RK NPU; it does not actually generate an RK NPU model. Only at run time are the marked Paddle operators converted into RK graph-building API calls to generate and execute the model. The resulting .nb model is loaded through the light API, as in the sketch below.
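The generated `.nb` model is consumed through the light API in the same way as in the MTK APU sketch above; the snippet below only adds the CPU thread-count and power-mode settings, whose values are illustrative assumptions.
```c++
// Minimal sketch (assumed usage of the light API): load the RK NPU .nb model;
// operators not supported by the NPU fall back to the ARM CPU kernels.
#include "paddle_api.h"

int main() {
  paddle::lite_api::MobileConfig config;
  config.set_model_from_file("mobilenet_v1_int8_224_for_rknpu.nb");
  config.set_threads(1);                                     // illustrative value
  config.set_power_mode(paddle::lite_api::LITE_POWER_HIGH);  // illustrative value

  auto predictor =
      paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::MobileConfig>(config);
  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});
  auto* data = input->mutable_data<float>();
  for (int i = 0; i < 1 * 3 * 224 * 224; ++i) data[i] = 1.0f;

  predictor->Run();
  auto output = predictor->GetOutput(0);
  const float* scores = output->data<float>();  // 1000 class scores
  (void)scores;
  return 0;
}
```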
### Updating the Paddle Lite Library with RK NPU Support
- Download the PaddleLite source code and the RK DDK:
```shell
$ git clone https://github.com/PaddlePaddle/Paddle-Lite.git
$ cd Paddle-Lite
$ git checkout <release-version-tag>
$ git clone https://github.com/airockchip/rknpu_ddk.git
```
- Build full_publish and tiny_publish for the RK1808 and RK1806 EVBs:
```shell
For RK1808 EVB
$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv8 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk full_publish
$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv8 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk tiny_publish
For RK1806 EVB
$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv7 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk full_publish
$ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv7 --arm_lang=gcc --build_extra=ON --with_log=ON --build_rknpu=ON --rknpu_ddk_root=./rknpu_ddk tiny_publish
```
- Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/include directory with the generated build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.rknpu/cxx/include;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/lib/libpaddle_light_api_shared.so file with the generated build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.rknpu/cxx/lib/libpaddle_light_api_shared.so;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/armhf/include directory with the generated build.lite.armlinux.armv7.gcc/inference_lite_lib.armlinux.armv7.rknpu/cxx/include;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/armhf/lib/libpaddle_light_api_shared.so file with the generated build.lite.armlinux.armv7.gcc/inference_lite_lib.armlinux.armv7.rknpu/cxx/lib/libpaddle_light_api_shared.so.
## Additional Notes
- The Rockchip engineering team is continuously adding operator bridges/converters for Paddle operators in order to support more Paddle models.
......@@ -2,15 +2,14 @@
Paddle-Lite supports building the x86 inference library in a Docker or Linux environment. For environment setup, see [Environment Preparation](../user_guides/source_compile).
(Note: a non-Docker Linux environment must be Ubuntu 16.04.)
## Building
1. Download the code
```bash
# download the Paddle-Lite source code
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
# switch to a release branch
git checkout release/v2.3
git checkout release/v2.6.0
```
2. Build from source
......@@ -18,6 +17,9 @@ git checkout release/v2.3
```bash
cd Paddle-Lite
./lite/tools/build.sh x86
# other optional build flags
# --with_log=OFF  disable log output
```
## Build Output
......@@ -31,35 +33,68 @@ The x86 build output is located in `build.lite.x86/inference_lite_lib`
- `include` : header files
- `lib` : library files
- Bundled static libraries:
- `libpaddle_api_full_bundled.a`: static library with both full_api and light_api functionality
- `libpaddle_api_light_bundled.a`: static library with light_api functionality only
- Bundled dynamic libraries:
- `libpaddle_full_api_shared.so`: dynamic library with both full_api and light_api functionality
- `libpaddle_light_api_shared.so`: dynamic library with light_api functionality only
- Static libraries:
- `libpaddle_api_full_bundled.a`: full_api static library
- `libpaddle_api_light_bundled.a`: light_api static library
- Dynamic libraries:
- `libpaddle_full_api_shared.so`: full_api dynamic library
- `libpaddle_light_api_shared.so`: light_api dynamic library
3. `third_party` folder: the third-party mklml library that the inference library depends on
- mklml: the mklml math library required by the Paddle-Lite inference library
4. `demo/cxx` folder: C++ example demos for the x86 inference library
- `mobilenetv1_full`: C++ demo that runs mobilenet_v1 inference with the full_api
- `mobilenetv1_light`: C++ demo that runs mobilenet_v1 inference with the light_api
3. `third_party` folder: third-party library files
## Using the x86 Inference API
1. We provide an example of running mobilenet_v1 with the x86 API on Linux: [mobilenet_full_x86demo](https://paddlelite-data.bj.bcebos.com/x86/mobilenet_full_x86demo.zip). After downloading and extracting, the contents are as follows:
1. Layout of the `mobilenetv1_full` directory
![](https://paddlelite-data.bj.bcebos.com/x86/x86-doc/demo.png)
```bash
mobilenetv1_full/
|-- CMakeLists.txt
|-- build.sh
`-- mobilenet_full_api.cc
```
`mobilenet_v1` is the model file, `lib` and `include` are the Paddle-Lite inference library and header files respectively, `third_party` contains the third-party `mklml` library required at build time, `mobilenet_full_api.cc` is the source code of the x86 example, and `build.sh` is the build script.
This demo is built with CMake: `CMakeLists.txt` is the CMake script, `mobilenet_full_api.cc` is the source code of the x86 example, and `build.sh` is the build script.
2. Demo contents and usage
2. How to use the demo
``` bash
# 1. Build
cd mobilenetv1_full
sh build.sh
```
The build produces `mobilenet_full_api` in the current directory.
``` bash
# 2. Run inference
mobilenet_full_api mobilenet_v1
./mobilenet_full_api ./mobilenet_v1
```
`mobilenet_v1` is the path to the model in the current directory, and `mobilenet_full_api` is the executable built in step 1.
Download and extract the model [`mobilenet_v1`](http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz) into the current directory, then run the command above to perform inference.
```bash
# 3. The demo output is shown below: mobilenet_v1 predictions for an all-ones input
Output shape 1000
Output[0]: 0.000191312
Output[100]: 0.000159713
Output[200]: 0.000264313
Output[300]: 0.000210793
Output[400]: 0.00103236
Output[500]: 0.000110071
Output[600]: 0.00482924
Output[700]: 0.00184533
Output[800]: 0.000202116
Output[900]: 0.000585591
```
3. Example source `mobilenet_full_api.cc`. A minimal sketch of its typical structure is shown below.
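The following is a minimal sketch of the structure such a full-API x86 demo source typically has, assuming the standard `paddle_api.h` interface; refer to the actual `mobilenet_full_api.cc` shipped with the demo for the authoritative version.
```c++
// Minimal sketch (assumed structure of a full-API x86 demo): build a predictor
// from the un-optimized mobilenet_v1 model and print a few scores for an
// all-ones input, matching the sample output above.
#include <iostream>
#include <string>
#include "paddle_api.h"

using namespace paddle::lite_api;  // NOLINT

void RunModel(const std::string& model_dir) {
  CxxConfig config;
  config.set_model_dir(model_dir);
  config.set_valid_places({Place{TARGET(kX86), PRECISION(kFloat)},
                           Place{TARGET(kHost), PRECISION(kFloat)}});
  auto predictor = CreatePaddlePredictor<CxxConfig>(config);

  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});
  auto* data = input->mutable_data<float>();
  for (int i = 0; i < 1 * 3 * 224 * 224; ++i) data[i] = 1.0f;

  predictor->Run();

  auto output = predictor->GetOutput(0);
  const int64_t num = output->shape()[1];  // 1000 classes
  std::cout << "Output shape " << num << std::endl;
  for (int64_t i = 0; i < num; i += 100) {
    std::cout << "Output[" << i << "]: " << output->data<float>()[i] << std::endl;
  }
}

int main(int argc, char** argv) {
  if (argc < 2) {
    std::cerr << "usage: ./mobilenet_full_api <model_dir>" << std::endl;
    return 1;
  }
  RunModel(argv[1]);
  return 0;
}
```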
......
......@@ -54,6 +54,9 @@ Welcome to Paddle-Lite's documentation!
demo_guides/opencl
demo_guides/fpga
demo_guides/npu
demo_guides/baidu_xpu
demo_guides/rockchip_npu
demo_guides/mediatek_apu
.. toctree::
:maxdepth: 1
......
......@@ -16,7 +16,6 @@ message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
message(STATUS "LITE_WITH_ARM_LANG:\t${LITE_WITH_ARM_LANG}")
set(LITE_MODEL_DIR "${THIRD_PARTY_PATH}/install")
set(LITE_ON_MOBILE ${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK})
......@@ -188,15 +187,17 @@ if (LITE_WITH_CUDA OR LITE_WITH_X86)
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
)
add_custom_target(publish_inference_third_party ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party")
if (LITE_WITH_CUDA)
add_custom_target(publish_inference_third_party ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party")
add_dependencies(publish_inference publish_inference_third_party)
endif()
add_dependencies(publish_inference_cxx_lib bundle_full_api)
add_dependencies(publish_inference_cxx_lib bundle_light_api)
add_dependencies(publish_inference_cxx_lib paddle_full_api_shared)
add_dependencies(publish_inference_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference publish_inference_cxx_lib)
add_dependencies(publish_inference publish_inference_third_party)
endif()
endif()
......@@ -238,9 +239,13 @@ if (LITE_WITH_X86)
add_dependencies(publish_inference_x86_cxx_lib test_model_bin)
add_custom_target(publish_inference_x86_cxx_demos ${TARGET}
COMMAND rm -rf "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_light_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_light"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_full_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_full"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/eigen3" "${INFER_LITE_PUBLISH_ROOT}/third_party"
)
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/mklml" "${INFER_LITE_PUBLISH_ROOT}/third_party/"
)
add_dependencies(publish_inference_x86_cxx_lib publish_inference_x86_cxx_demos)
add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3)
add_dependencies(publish_inference publish_inference_x86_cxx_lib)
......@@ -369,6 +374,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mask_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mask_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mask_detection/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_libs" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_libs/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_libs/Makefile"
)
add_dependencies(publish_inference_android_cxx_demos logging gflags)
add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)
......
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR (NOT LITE_WITH_LOG))
lite_cc_library(place SRCS paddle_place.cc DEPS logging)
else()
lite_cc_library(place SRCS paddle_place.cc DEPS glog)
......
......@@ -48,6 +48,7 @@ USE_LITE_OP(concat)
USE_LITE_OP(conv2d)
USE_LITE_OP(depthwise_conv2d)
USE_LITE_OP(pool2d)
USE_LITE_OP(max_pool2d_with_index)
USE_LITE_OP(batch_norm)
USE_LITE_OP(fusion_elementwise_sub_activation)
USE_LITE_OP(transpose)
......
......@@ -151,6 +151,11 @@ std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
// get outputnames
std::vector<std::string> Predictor::GetOutputNames() { return output_names_; }
// get param names
std::vector<std::string> Predictor::GetParamNames() {
return exec_scope_->AttributeVarNames();
}
// append the names of inputs and outputs into input_names_ and output_names_
void Predictor::PrepareFeedFetch() {
if (!program_) {
......@@ -293,6 +298,7 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
// `inner_places` is used to optimize passes
std::vector<Place> inner_places = valid_places;
for (auto &valid_place : valid_places) {
if (valid_place.target == TARGET(kOpenCL)) continue;
inner_places.emplace_back(
Place(TARGET(kHost), valid_place.precision, valid_place.layout));
}
......@@ -345,9 +351,16 @@ void Predictor::GenRuntimeProgram() {
const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
auto *var = exec_scope_->FindVar(name);
CHECK(var) << "no variable named with " << name << " in exec_scope";
return &var->Get<lite::Tensor>();
}
lite::Tensor *Predictor::GetMutableTensor(const std::string &name) {
auto *var = exec_scope_->FindVar(name);
CHECK(var) << "no variable named with " << name << " in exec_scope";
return var->GetMutable<lite::Tensor>();
}
// get input by name
lite::Tensor *Predictor::GetInputByName(const std::string &name) {
auto element = std::find(input_names_.begin(), input_names_.end(), name);
......
......@@ -85,6 +85,9 @@ class LITE_API Predictor {
// get inputnames and get outputnames.
std::vector<std::string> GetInputNames();
std::vector<std::string> GetOutputNames();
// get param names
std::vector<std::string> GetParamNames();
void PrepareFeedFetch();
// Get offset-th col of fetch results.
......@@ -92,6 +95,9 @@ class LITE_API Predictor {
std::vector<const lite::Tensor*> GetOutputs() const;
const cpp::ProgramDesc& program_desc() const;
// get a mutable tensor according to its name
lite::Tensor* GetMutableTensor(const std::string& name);
// get a const tensor according to its name
const lite::Tensor* GetTensor(const std::string& name) const;
const RuntimeProgram& runtime_program() const;
......@@ -142,9 +148,15 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
// get inputs names and get outputs names
std::vector<std::string> GetInputNames() override;
std::vector<std::string> GetOutputNames() override;
// get param names
std::vector<std::string> GetParamNames() override;
// get tensor according to tensor's name
std::unique_ptr<const lite_api::Tensor> GetTensor(
const std::string& name) const override;
// get a mutable tensor according to tensor's name
std::unique_ptr<lite_api::Tensor> GetMutableTensor(
const std::string& name) override;
// Get InputTensor by name
std::unique_ptr<lite_api::Tensor> GetInputByName(
......
......@@ -97,6 +97,10 @@ std::vector<std::string> CxxPaddleApiImpl::GetInputNames() {
return raw_predictor_.GetInputNames();
}
std::vector<std::string> CxxPaddleApiImpl::GetParamNames() {
return raw_predictor_.GetParamNames();
}
std::vector<std::string> CxxPaddleApiImpl::GetOutputNames() {
return raw_predictor_.GetOutputNames();
}
......@@ -123,6 +127,12 @@ std::unique_ptr<const lite_api::Tensor> CxxPaddleApiImpl::GetTensor(
return std::unique_ptr<const lite_api::Tensor>(new lite_api::Tensor(x));
}
std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetMutableTensor(
const std::string &name) {
return std::unique_ptr<lite_api::Tensor>(
new lite_api::Tensor(raw_predictor_.GetMutableTensor(name)));
}
std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetInputByName(
const std::string &name) {
return std::unique_ptr<lite_api::Tensor>(
......
......@@ -36,7 +36,7 @@ DEFINE_string(model_dir_0, "", "model_dir_0");
DEFINE_string(input_shape_0,
"1,3,224,224",
"input shapes another, separated by colon and comma");
DEFINE_string(target, "arm", "main target for Predictor: arm, opencl");
DEFINE_bool(use_optimize_nb,
false,
"optimized & naive buffer model for mobile devices");
......@@ -51,9 +51,19 @@ void OutputOptModel(const std::string& load_model_dir,
const std::vector<std::vector<int64_t>>& input_shapes) {
lite_api::CxxConfig config;
config.set_model_dir(load_model_dir);
config.set_valid_places({
Place{TARGET(kARM), PRECISION(kFloat)},
});
if (FLAGS_target == "arm") {
config.set_valid_places({
Place{TARGET(kARM), PRECISION(kFloat)},
});
} else if (FLAGS_target == "opencl") {
config.set_valid_places({
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
Place{TARGET(kARM)}, // enable kARM CPU kernel when no opencl kernel
});
}
auto predictor = lite_api::CreatePaddlePredictor(config);
// delete old optimized model
......@@ -78,7 +88,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
int tid,
const int warmup_times = 5) {
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
config.set_model_from_file(model_dir + ".nb");
config.set_power_mode(power_mode);
config.set_threads(thread_num);
......@@ -197,7 +207,7 @@ void RunTestType_10(const std::vector<std::vector<int64_t>>& input_shapes,
const int repeat,
int warmup = 5) {
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
config.set_model_from_file(model_dir + ".nb");
config.set_power_mode(power_mode);
config.set_threads(thread_num);
......@@ -218,13 +228,13 @@ void RunTestType_11(const std::vector<std::vector<int64_t>>& input_shapes,
const int repeat,
int warmup = 5) {
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
config.set_model_from_file(model_dir + ".nb");
config.set_power_mode(power_mode);
config.set_threads(thread_num);
auto predictor = lite_api::CreatePaddlePredictor(config);
config.set_model_dir(model_dir_0);
config.set_model_from_file(model_dir_0 + ".nb");
auto predictor_0 = lite_api::CreatePaddlePredictor(config);
for (int i = 0; i < 2 * repeat; i += 2) {
......@@ -246,7 +256,8 @@ int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir == "") {
LOG(INFO) << "usage: "
<< "--model_dir /path/to/your/model";
<< "--model_dir /path/to/your/model --model_dir_0 "
"/path/to/your/model0 --target `arm` or `opencl`";
exit(0);
}
std::string save_optimized_model_dir = "";
......
......@@ -55,7 +55,7 @@ DEFINE_string(model_file, "", "model file path of the combined-param model");
DEFINE_string(param_file, "", "param file path of the combined-param model");
DEFINE_string(
optimize_out_type,
"protobuf",
"naive_buffer",
"store type of the output optimized model. protobuf/naive_buffer");
DEFINE_bool(display_kernels, false, "Display kernel information");
DEFINE_bool(record_tailoring_info,
......@@ -207,7 +207,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
}
std::cout << std::setiosflags(std::ios::internal);
std::cout << std::setw(maximum_optype_length) << "OP_name";
for (int i = 0; i < targets.size(); i++) {
for (size_t i = 0; i < targets.size(); i++) {
std::cout << std::setw(10) << targets[i].substr(1);
}
std::cout << std::endl;
......@@ -215,7 +215,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) {
std::cout << std::setw(maximum_optype_length) << it->first;
auto ops_valid_places = it->second;
for (int i = 0; i < targets.size(); i++) {
for (size_t i = 0; i < targets.size(); i++) {
if (std::find(ops_valid_places.begin(),
ops_valid_places.end(),
targets[i]) != ops_valid_places.end()) {
......@@ -235,7 +235,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
}
// Print OP info.
auto ops_valid_places = supported_ops.at(*op);
for (int i = 0; i < targets.size(); i++) {
for (size_t i = 0; i < targets.size(); i++) {
if (std::find(ops_valid_places.begin(),
ops_valid_places.end(),
targets[i]) != ops_valid_places.end()) {
......@@ -288,11 +288,11 @@ void ParseInputCommand() {
auto valid_places = paddle::lite_api::ParserValidPlaces();
// get valid_targets string
std::vector<TargetType> target_types = {};
for (int i = 0; i < valid_places.size(); i++) {
for (size_t i = 0; i < valid_places.size(); i++) {
target_types.push_back(valid_places[i].target);
}
std::string targets_str = TargetToStr(target_types[0]);
for (int i = 1; i < target_types.size(); i++) {
for (size_t i = 1; i < target_types.size(); i++) {
targets_str = targets_str + TargetToStr(target_types[i]);
}
......@@ -301,7 +301,7 @@ void ParseInputCommand() {
target_types.push_back(TARGET(kUnk));
std::set<std::string> valid_ops;
for (int i = 0; i < target_types.size(); i++) {
for (size_t i = 0; i < target_types.size(); i++) {
auto ops = supported_ops_target[static_cast<int>(target_types[i])];
valid_ops.insert(ops.begin(), ops.end());
}
......@@ -318,7 +318,7 @@ void CheckIfModelSupported() {
auto valid_unktype_ops = supported_ops_target[static_cast<int>(TARGET(kUnk))];
valid_ops.insert(
valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end());
for (int i = 0; i < valid_places.size(); i++) {
for (size_t i = 0; i < valid_places.size(); i++) {
auto target = valid_places[i].target;
auto ops = supported_ops_target[static_cast<int>(target)];
valid_ops.insert(valid_ops.end(), ops.begin(), ops.end());
......@@ -340,7 +340,7 @@ void CheckIfModelSupported() {
std::set<std::string> unsupported_ops;
std::set<std::string> input_model_ops;
for (int index = 0; index < cpp_prog.BlocksSize(); index++) {
for (size_t index = 0; index < cpp_prog.BlocksSize(); index++) {
auto current_block = cpp_prog.GetBlock<lite::cpp::BlockDesc>(index);
for (size_t i = 0; i < current_block->OpsSize(); ++i) {
auto& op_desc = *current_block->GetOp<lite::cpp::OpDesc>(i);
......@@ -364,13 +364,13 @@ void CheckIfModelSupported() {
unsupported_ops_str = unsupported_ops_str + ", " + *op_str;
}
std::vector<TargetType> targets = {};
for (int i = 0; i < valid_places.size(); i++) {
for (size_t i = 0; i < valid_places.size(); i++) {
targets.push_back(valid_places[i].target);
}
std::sort(targets.begin(), targets.end());
targets.erase(unique(targets.begin(), targets.end()), targets.end());
std::string targets_str = TargetToStr(targets[0]);
for (int i = 1; i < targets.size(); i++) {
for (size_t i = 1; i < targets.size(); i++) {
targets_str = targets_str + "," + TargetToStr(targets[i]);
}
......
......@@ -82,27 +82,56 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
"command argument 'valid_targets'";
}
void OptBase::SetOptimizeOut(const std::string& optimized_out_path) {
optimize_out_path_ = optimized_out_path;
void OptBase::SetLiteOut(const std::string& lite_out_name) {
lite_out_name_ = lite_out_name;
}
void OptBase::RunOptimize(bool record_strip_info) {
void OptBase::RecordModelInfo(bool record_strip_info) {
record_strip_info_ = record_strip_info;
}
void OptBase::Run() {
CheckIfModelSupported(false);
OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
opt_config_.set_valid_places(valid_places_);
if (model_set_dir_ != "") {
RunOptimizeFromModelSet(record_strip_info);
RunOptimizeFromModelSet(record_strip_info_);
} else {
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel(
optimize_out_path_, model_type_, record_strip_info);
lite_out_name_, model_type_, record_strip_info_);
auto resulted_model_name =
record_strip_info ? "information of striped model" : "optimized model";
record_strip_info_ ? "information of striped model" : "optimized model";
std::cout << "Save the " << resulted_model_name
<< " into :" << optimize_out_path_ << "successfully";
<< " into :" << lite_out_name_ << "successfully";
}
}
void OptBase::RunOptimize(const std::string& model_dir_path,
const std::string& model_path,
const std::string& param_path,
const std::string& valid_places,
const std::string& optimized_out_path) {
SetModelDir(model_dir_path);
SetModelFile(model_path);
SetParamFile(param_path);
SetValidPlaces(valid_places);
SetLiteOut(optimized_out_path);
CheckIfModelSupported(false);
OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
opt_config_.set_valid_places(valid_places_);
if (model_set_dir_ != "") {
RunOptimizeFromModelSet(record_strip_info_);
} else {
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel(
lite_out_name_, model_type_, record_strip_info_);
auto resulted_model_name =
record_strip_info_ ? "information of striped model" : "optimized model";
std::cout << "Save the " << resulted_model_name
<< " into :" << lite_out_name_ << "successfully";
}
}
// collect ops info of modelset
void CollectModelMetaInfo(const std::string& output_dir,
const std::vector<std::string>& models,
......@@ -125,7 +154,7 @@ void OptBase::SetModelSetDir(const std::string& model_set_path) {
}
void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
// 1. mkdir of outputed optimized model set.
lite::MkDirRecur(optimize_out_path_);
lite::MkDirRecur(lite_out_name_);
auto model_dirs = lite::ListDir(model_set_dir_, true);
if (model_dirs.size() == 0) {
LOG(FATAL) << "[" << model_set_dir_ << "] does not contain any model";
......@@ -138,7 +167,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
std::string input_model_dir =
lite::Join<std::string>({model_set_dir_, name}, "/");
std::string output_model_dir =
lite::Join<std::string>({optimize_out_path_, name}, "/");
lite::Join<std::string>({lite_out_name_, name}, "/");
if (opt_config_.model_file() != "" && opt_config_.param_file() != "") {
auto model_file_path =
......@@ -155,7 +184,7 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
auto opt_predictor = lite_api::CreatePaddlePredictor(opt_config_);
opt_predictor->SaveOptimizedModel(
optimize_out_path_, model_type_, record_strip_info);
lite_out_name_, model_type_, record_strip_info);
std::cout << "Optimize done. ";
}
......@@ -164,46 +193,60 @@ void OptBase::RunOptimizeFromModelSet(bool record_strip_info) {
if (record_strip_info) {
// Collect all models information
CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME);
lite_out_name_, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(
lite_out_name_, model_dirs, lite::TAILORD_OPS_LIST_NAME);
CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_OPS_LIST_NAME);
CollectModelMetaInfo(optimize_out_path_,
model_dirs,
lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME);
lite_out_name_, model_dirs, lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME);
CollectModelMetaInfo(
optimize_out_path_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME);
lite_out_name_, model_dirs, lite::TAILORD_KERNELS_LIST_NAME);
std::cout << "Record the information of stripped models into :"
<< optimize_out_path_ << "successfully";
<< lite_out_name_ << "successfully";
}
}
void OptBase::PrintHelpInfo() {
const std::string opt_version = lite::version();
const char help_info[] =
"At least one argument should be inputed. Valid arguments are listed "
"below:\n"
"------------------------------------------------------------------------"
"-----------------------------------------------------------\n"
" Valid arguments of Paddle-Lite opt are listed below:\n"
"------------------------------------------------------------------------"
"-----------------------------------------------------------\n"
" Arguments of help information:\n"
" `help()` Print help infomation\n"
" Arguments of model optimization:\n"
"\n"
" Arguments of model transformation:\n"
" `set_model_dir(model_dir)`\n"
" `set_model_file(model_file_path)`\n"
" `set_param_file(param_file_path)`\n"
" `set_model_type(protobuf|naive_buffer)`\n"
" `set_optimize_out(output_optimize_model_dir)`\n"
" `set_model_type(protobuf|naive_buffer)`: naive_buffer by "
"default\n"
" `set_lite_out(output_optimize_model_dir)`\n"
" `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" `run_optimize(false|true)`\n"
" ` ----fasle&true refer to whether to record ops info for "
"tailoring lib, false by default`\n"
" Arguments of model checking and ops information:\n"
" `record_model_info(false|true)`: refer to whether to record ops "
"info for striping lib, false by default`\n"
" `run() : start model transformation`\n"
" eg. `opt.set_model_dir(\"./mobilenetv1\"); "
"opt.set_lite_out(\"mobilenetv1_opt\"); opt.set_valid_places(\"arm\"); "
"opt.run();`\n"
"\n"
" You can also transform model through a single input argument:\n"
" `run_optimize(model_dir, model_file_path, param_file_path, "
"model_type, valid_places, lite_out_name) `\n"
" eg. `opt.run_optimize(\"./mobilenetv1\", \"\", \"\", "
"\"naive_buffer\", \"arm\", \"mobilenetv1_opt\");`"
"\n"
" Arguments of checking model and printing ops information:\n"
" `print_all_ops()` Display all the valid operators of "
"Paddle-Lite\n"
" `print_supported_ops` Display supported operators of valid "
"places\n"
" `check_if_model_supported()` Check if the input model is "
"supported\n";
std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl;
"supported\n"
"------------------------------------------------------------------------"
"-----------------------------------------------------------\n";
std::cout << "opt version:" << opt_version << std::endl << help_info;
}
// 2. Print supported info of inputed ops
void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
......
......@@ -44,16 +44,21 @@ class LITE_API OptBase {
public:
OptBase() = default;
void SetModelSetDir(const std::string &model_set_path);
void SetModelDir(const std::string &model_path);
void SetModelDir(const std::string &model_dir_path);
void SetModelFile(const std::string &model_path);
void SetParamFile(const std::string &param_path);
void SetValidPlaces(const std::string &valid_places);
void SetOptimizeOut(const std::string &optimized_out_path);
void SetLiteOut(const std::string &lite_out_name);
void RecordModelInfo(bool record_strip_info = true);
// set optimized_model type
void SetModelType(std::string model_type);
// transform and save the optimized model
void RunOptimize(bool record_strip_info = false);
void Run();
void RunOptimize(const std::string &model_dir_path = "",
const std::string &model_path = "",
const std::string &param_path = "",
const std::string &valid_places = "",
const std::string &optimized_out_path = "");
// fuctions of printing info
// 1. help info
void PrintHelpInfo();
......@@ -71,12 +76,12 @@ class LITE_API OptBase {
// valid places for the optimized_model
std::vector<Place> valid_places_;
// filename of the optimized_model
std::string optimize_out_path_;
std::string lite_out_name_;
// type of the optimized_model, kNaiveBuffer default.
LiteModelType model_type_{LiteModelType::kNaiveBuffer};
// Dir path of a set of models, this should be combined with model
std::string model_set_dir_;
bool record_strip_info_{false};
void RunOptimizeFromModelSet(bool record_strip_info = false);
};
......
......@@ -167,6 +167,20 @@ lod_t Tensor::lod() const { return ctensor(raw_tensor_)->lod(); }
void Tensor::SetLoD(const lod_t &lod) { tensor(raw_tensor_)->set_lod(lod); }
std::unique_ptr<Tensor> PaddlePredictor::GetMutableTensor(
const std::string &name) {
LOG(FATAL)
<< "The GetMutableTensor API is only supported by CxxConfig predictor.";
return nullptr;
}
std::vector<std::string> PaddlePredictor::GetParamNames() {
std::vector<std::string> null_result = {};
LOG(FATAL)
<< "The GetParamNames API is only supported by CxxConfig predictor.";
return null_result;
}
void PaddlePredictor::SaveOptimizedModel(const std::string &model_dir,
LiteModelType model_type,
bool record_info) {
......
......@@ -86,6 +86,8 @@ class LITE_API PaddlePredictor {
virtual std::vector<std::string> GetInputNames() = 0;
// Get output names
virtual std::vector<std::string> GetOutputNames() = 0;
// Get output names
virtual std::vector<std::string> GetParamNames();
// Get Input by name
virtual std::unique_ptr<Tensor> GetInputByName(const std::string& name) = 0;
......@@ -93,6 +95,9 @@ class LITE_API PaddlePredictor {
/// Get a readonly tensor, return null if no one called `name` exists.
virtual std::unique_ptr<const Tensor> GetTensor(
const std::string& name) const = 0;
/// Get a mutable tensor, return null if no one called `name` exists
/// internal inference API, not recommended.
virtual std::unique_ptr<Tensor> GetMutableTensor(const std::string& name);
/// Persist the optimized model to disk. This API is only supported by
/// CxxConfig, and the persisted model can be reused for MobileConfig.
......@@ -176,7 +181,7 @@ class LITE_API CxxConfig : public ConfigBase {
#endif
#ifdef LITE_WITH_CUDA
void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; }
int multi_stream() const { return multi_stream_; }
bool multi_stream() const { return multi_stream_; }
#endif
#ifdef LITE_WITH_MLU
......@@ -208,6 +213,8 @@ class LITE_API CxxConfig : public ConfigBase {
// current thread.
void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00);
// XPU only, specify the target device ID for the current thread.
// **DEPRECATED**, use xpu_set_device() at the very beginning of each worker
// thread
void set_xpu_dev_per_thread(int dev_no = 0);
};
......
......@@ -19,7 +19,13 @@
#pragma once
// some platform-independent definitions
#include "lite/utils/macros.h"
#if defined(_WIN32)
#define UNUSED
#define __builtin_expect(EXP, C) (EXP)
#else
#define UNUSED __attribute__((unused))
#endif
#define USE_LITE_OP(op_type__) \
extern int touch_op_##op_type__(); \
......
......@@ -161,6 +161,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kBM),
TARGET(kMLU),
TARGET(kAPU),
TARGET(kRKNPU),
TARGET(kFPGA)});
if (target == TARGET(kAny)) {
return valid_set;
......
......@@ -33,6 +33,7 @@ USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass);
USE_MIR_PASS(lite_interpolate_fuse_pass);
USE_MIR_PASS(lite_sequence_pool_concat_fuse_pass);
USE_MIR_PASS(identity_scale_eliminate_pass);
USE_MIR_PASS(identity_dropout_eliminate_pass);
USE_MIR_PASS(lite_conv_elementwise_fuse_pass);
USE_MIR_PASS(lite_conv_activation_fuse_pass);
USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass);
......@@ -51,5 +52,8 @@ USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(apu_subgraph_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(lite_scale_activation_fuse_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass);
USE_MIR_PASS(__xpu__fc_fuse_pass);
......@@ -62,8 +62,10 @@ void BindLiteOpt(py::module *m) {
.def("set_model_file", &OptBase::SetModelFile)
.def("set_param_file", &OptBase::SetParamFile)
.def("set_valid_places", &OptBase::SetValidPlaces)
.def("set_optimize_out", &OptBase::SetOptimizeOut)
.def("set_lite_out", &OptBase::SetLiteOut)
.def("set_model_type", &OptBase::SetModelType)
.def("record_model_info", &OptBase::RecordModelInfo)
.def("run", &OptBase::Run)
.def("run_optimize", &OptBase::RunOptimize)
.def("help", &OptBase::PrintHelpInfo)
.def("print_supported_ops", &OptBase::PrintSupportedOps)
......
......@@ -33,11 +33,17 @@ else:
PADDLELITE_VERSION = PADDLELITE_TAG
# core lib of paddlelite is stored as lite.so
LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite'
files = os.listdir('${PADDLE_BINARY_DIR}')
INFERENCE_LITE_LIB_PATH = ''
for file in files:
if file.find('inference_lite_lib') == 0:
INFERENCE_LITE_LIB_PATH = '${PADDLE_BINARY_DIR}/' + file
break
LITE_PATH = INFERENCE_LITE_LIB_PATH + '/python/install/lite'
PACKAGE_DATA = {'paddlelite': ['lite.so' if os.name!='nt' else 'lite.pyd']}
# put all thirdparty libraries in paddlelite.libs
PACKAGE_DATA['paddlelite.libs'] = []
LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs'
LIB_PATH = INFERENCE_LITE_LIB_PATH + '/python/install/libs/'
if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH)
shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH)
......@@ -49,8 +55,7 @@ if '${WITH_MKL}' == 'ON':
PACKAGE_DATA['paddlelite.libs'] += ['msvcr120.dll']
# link lite.so to paddlelite.libs
if os.name != 'nt':
COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\
/inference_lite_lib/python/install/lite/lite.so"
COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' " + LITE_PATH + "/lite.so"
if os.system(COMMAND) != 0:
raise Exception("patch third_party libs failed, command: %s" % COMMAND)
......
......@@ -15,6 +15,7 @@
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <fstream>
#include <thread> //NOLINT
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_kernels.h"
......@@ -30,14 +31,18 @@ DEFINE_string(input_img_txt_path,
namespace paddle {
namespace lite {
void TestModel(const std::vector<Place>& valid_places) {
const int g_batch_size = 1;
const int g_thread_num = 1;
void instance_run() {
lite::Predictor predictor;
std::vector<std::string> passes;
std::vector<Place> valid_places({Place{TARGET(kBM), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
predictor.Build(FLAGS_model_dir, "", "", valid_places, passes);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(
std::vector<DDim::value_type>({1, 3, FLAGS_im_height, FLAGS_im_width})));
input_tensor->Resize(DDim(std::vector<DDim::value_type>(
{g_batch_size, 3, FLAGS_im_height, FLAGS_im_width})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
if (FLAGS_input_img_txt_path.empty()) {
......@@ -45,12 +50,15 @@ void TestModel(const std::vector<Place>& valid_places) {
data[i] = 1;
}
} else {
std::fstream fs(FLAGS_input_img_txt_path, std::ios::in);
if (!fs.is_open()) {
LOG(FATAL) << "open input_img_txt error.";
}
for (int i = 0; i < item_size; i++) {
fs >> data[i];
for (int j = 0; j < g_batch_size; j++) {
std::fstream fs(FLAGS_input_img_txt_path, std::ios::in);
if (!fs.is_open()) {
LOG(FATAL) << "open input_img_txt error.";
}
for (int i = 0; i < item_size / g_batch_size; i++) {
fs >> data[i];
}
data += j * item_size / g_batch_size;
}
}
for (int i = 0; i < FLAGS_warmup; ++i) {
......@@ -72,6 +80,7 @@ void TestModel(const std::vector<Place>& valid_places) {
FILE* fp = fopen("result.txt", "wb");
for (int i = 0; i < out.size(); i++) {
auto* out_data = out[i]->data<float>();
LOG(INFO) << out[i]->numel();
for (int j = 0; j < out[i]->numel(); j++) {
fprintf(fp, "%f\n", out_data[j]);
}
......@@ -79,6 +88,16 @@ void TestModel(const std::vector<Place>& valid_places) {
fclose(fp);
}
void TestModel(const std::vector<Place>& valid_places) {
std::vector<std::unique_ptr<std::thread>> instances_vec;
for (int i = 0; i < g_thread_num; ++i) {
instances_vec.emplace_back(new std::thread(&instance_run));
}
for (int i = 0; i < g_thread_num; ++i) {
instances_vec[i]->join();
}
}
TEST(Classify, test_bm) {
std::vector<Place> valid_places({Place{TARGET(kBM), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
......
......@@ -2,4 +2,5 @@ if(NOT LITE_WITH_APU)
return()
endif()
lite_cc_library(device_apu SRCS device.cc)
lite_cc_library(neuron_adapter SRCS neuron_adapter.cc)
lite_cc_library(device_apu SRCS device.cc DEPS neuron_adapter)
......@@ -20,48 +20,19 @@ namespace paddle {
namespace lite {
namespace apu {
inline void* LoadFunc(void* libHandle, const char* name) {
CHECK(libHandle != nullptr);
CHECK(name != nullptr);
void* fn = dlsym(libHandle, name);
if (fn == nullptr) {
LOG(WARNING) << "Unable to open Neuron Runtime function [" << name
<< "] Because " << dlerror();
}
return fn;
}
NeuronCompilation* Device::Build(void* libHandle, NeuronModel* model) {
typedef int (*NeuronCompilation_create)(NeuronModel * model,
NeuronCompilation * *compilation);
typedef void (*NeuronCompilation_free)(NeuronCompilation * compilation);
typedef int (*NeuronCompilation_finish)(NeuronCompilation * compilation);
#define LOAD_FUNCTIONS(libHandle, FUNC_NAME, VARIABLE_NAME) \
FUNC_NAME VARIABLE_NAME = \
reinterpret_cast<FUNC_NAME>(LoadFunc(libHandle, #FUNC_NAME));
LOAD_FUNCTIONS(libHandle, NeuronCompilation_create, neuron_compilation_create)
LOAD_FUNCTIONS(libHandle, NeuronCompilation_free, neuron_compilation_free)
LOAD_FUNCTIONS(libHandle, NeuronCompilation_finish, neuron_compilation_finish)
#undef LOAD_FUNCTIONS
int neuron_errCode = 0;
NeuronCompilation* compilation = NULL;
NeuronCompilation* Device::Build(NeuronModel* model) {
VLOG(3) << "[APU] Compile model";
neuron_errCode = (*neuron_compilation_create)(model, &compilation);
NeuronCompilation* compilation = NULL;
int neuron_errCode = NeuronCompilation_create(model, &compilation);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "[APU] create compile failed! " << neuron_errCode;
return nullptr;
}
neuron_errCode = (*neuron_compilation_finish)(compilation);
neuron_errCode = NeuronCompilation_finish(compilation);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "[APU] compile failed! " << neuron_errCode;
return nullptr;
}
VLOG(3) << "[APU] Build done";
return compilation;
}
......
......@@ -18,7 +18,7 @@
#include <string>
#include <unordered_map>
#include <vector>
#include "NeuronAdapter.h" // NOLINT
#include "lite/backends/apu/neuron_adapter.h"
namespace paddle {
namespace lite {
......@@ -32,7 +32,7 @@ class Device {
}
Device() {}
NeuronCompilation* Build(void* libHandle, NeuronModel* model);
NeuronCompilation* Build(NeuronModel* model);
};
} // namespace apu
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/apu/neuron_adapter.h"
#include <dlfcn.h>
#include <string>
#include <vector>
namespace paddle {
namespace lite {
NeuronAdapter* NeuronAdapter::Global() {
static NeuronAdapter adapter;
return &adapter;
}
NeuronAdapter::NeuronAdapter() {
CHECK(InitHandle()) << "Fail to initialize the Neuron Adapter library!";
InitFunctions();
}
bool NeuronAdapter::InitHandle() {
const std::vector<std::string> paths = {
"libneuron_adapter.so",
#if defined(__aarch64__)
"/vendor/lib64/libneuron_adapter.so",
"/system/lib64/libneuron_adapter.so",
"/system/vendor/lib64/libneuron_adapter.so",
#else
"/vendor/lib/libneuron_adapter.so",
"/system/lib/libneuron_adapter.so",
"/system/vendor/lib/libneuron_adapter.so",
#endif
};
std::string target_lib = "Unknown";
for (auto path : paths) {
handle_ = dlopen(path.c_str(), RTLD_LAZY);
if (handle_ != nullptr) {
target_lib = path;
break;
}
}
VLOG(4) << "Load the Neuron Adapter library from " << target_lib;
if (handle_ != nullptr) {
return true;
} else {
return false;
}
}
void NeuronAdapter::InitFunctions() {
CHECK(handle_ != nullptr) << "The library handle can't be null!";
#define PADDLE_DLSYM(neuron_adapter_func) \
do { \
neuron_adapter_func##_ = \
(neuron_adapter_func##_Type)dlsym(handle_, #neuron_adapter_func); \
if (neuron_adapter_func##_ == nullptr) { \
LOG(FATAL) << "Cannot find the " << #neuron_adapter_func \
<< " symbol in libneuron_adapter.so!"; \
break; \
} \
VLOG(4) << "Loaded the " << #neuron_adapter_func \
<< " symbol successfully."; \
} while (false)
PADDLE_DLSYM(Neuron_getVersion);
PADDLE_DLSYM(NeuronModel_create);
PADDLE_DLSYM(NeuronModel_free);
PADDLE_DLSYM(NeuronModel_finish);
PADDLE_DLSYM(NeuronModel_addOperand);
PADDLE_DLSYM(NeuronModel_setOperandValue);
PADDLE_DLSYM(NeuronModel_setOperandSymmPerChannelQuantParams);
PADDLE_DLSYM(NeuronModel_addOperation);
PADDLE_DLSYM(NeuronModel_identifyInputsAndOutputs);
PADDLE_DLSYM(NeuronCompilation_create);
PADDLE_DLSYM(NeuronCompilation_free);
PADDLE_DLSYM(NeuronCompilation_finish);
PADDLE_DLSYM(NeuronExecution_create);
PADDLE_DLSYM(NeuronExecution_free);
PADDLE_DLSYM(NeuronExecution_setInput);
PADDLE_DLSYM(NeuronExecution_setOutput);
PADDLE_DLSYM(NeuronExecution_compute);
#undef PADDLE_DLSYM
}
} // namespace lite
} // namespace paddle
int Neuron_getVersion(uint32_t* version) {
return paddle::lite::NeuronAdapter::Global()->Neuron_getVersion()(version);
}
int NeuronModel_create(NeuronModel** model) {
return paddle::lite::NeuronAdapter::Global()->NeuronModel_create()(model);
}
void NeuronModel_free(NeuronModel* model) {
return paddle::lite::NeuronAdapter::Global()->NeuronModel_free()(model);
}
int NeuronModel_finish(NeuronModel* model) {
return paddle::lite::NeuronAdapter::Global()->NeuronModel_finish()(model);
}
int NeuronModel_addOperand(NeuronModel* model, const NeuronOperandType* type) {
return paddle::lite::NeuronAdapter::Global()->NeuronModel_addOperand()(model,
type);
}
int NeuronModel_setOperandValue(NeuronModel* model,
int32_t index,
const void* buffer,
size_t length) {
return paddle::lite::NeuronAdapter::Global()->NeuronModel_setOperandValue()(
model, index, buffer, length);
}
int NeuronModel_setOperandSymmPerChannelQuantParams(
NeuronModel* model,
int32_t index,
const NeuronSymmPerChannelQuantParams* channelQuant) {
return paddle::lite::NeuronAdapter::Global()
->NeuronModel_setOperandSymmPerChannelQuantParams()(
model, index, channelQuant);
}
int NeuronModel_addOperation(NeuronModel* model,
NeuronOperationType type,
uint32_t inputCount,
const uint32_t* inputs,
uint32_t outputCount,
const uint32_t* outputs) {
return paddle::lite::NeuronAdapter::Global()->NeuronModel_addOperation()(
model, type, inputCount, inputs, outputCount, outputs);
}
int NeuronModel_identifyInputsAndOutputs(NeuronModel* model,
uint32_t inputCount,
const uint32_t* inputs,
uint32_t outputCount,
const uint32_t* outputs) {
return paddle::lite::NeuronAdapter::Global()
->NeuronModel_identifyInputsAndOutputs()(
model, inputCount, inputs, outputCount, outputs);
}
int NeuronCompilation_create(NeuronModel* model,
NeuronCompilation** compilation) {
return paddle::lite::NeuronAdapter::Global()->NeuronCompilation_create()(
model, compilation);
}
void NeuronCompilation_free(NeuronCompilation* compilation) {
return paddle::lite::NeuronAdapter::Global()->NeuronCompilation_free()(
compilation);
}
int NeuronCompilation_finish(NeuronCompilation* compilation) {
return paddle::lite::NeuronAdapter::Global()->NeuronCompilation_finish()(
compilation);
}
int NeuronExecution_create(NeuronCompilation* compilation,
NeuronExecution** execution) {
return paddle::lite::NeuronAdapter::Global()->NeuronExecution_create()(
compilation, execution);
}
void NeuronExecution_free(NeuronExecution* execution) {
return paddle::lite::NeuronAdapter::Global()->NeuronExecution_free()(
execution);
}
int NeuronExecution_setInput(NeuronExecution* execution,
int32_t index,
const NeuronOperandType* type,
const void* buffer,
size_t length) {
return paddle::lite::NeuronAdapter::Global()->NeuronExecution_setInput()(
execution, index, type, buffer, length);
}
int NeuronExecution_setOutput(NeuronExecution* execution,
int32_t index,
const NeuronOperandType* type,
void* buffer,
size_t length) {
return paddle::lite::NeuronAdapter::Global()->NeuronExecution_setOutput()(
execution, index, type, buffer, length);
}
int NeuronExecution_compute(NeuronExecution* execution) {
return paddle::lite::NeuronAdapter::Global()->NeuronExecution_compute()(
execution);
}
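A hedged usage sketch (not from the patch) of the adapter above: callers use the plain C entry points, and each wrapper resolves its symbol from libneuron_adapter.so on first use through NeuronAdapter::Global(). The success code is assumed to be NEURON_NO_ERROR, as in the Neuron headers.
#include <cstdint>
#include <cstdio>
void probe_neuron_version() {
  uint32_t version = 0;
  int err = Neuron_getVersion(&version);
  if (err == NEURON_NO_ERROR) {  // assumed success code from NeuronAdapter.h
    std::printf("Neuron runtime version: %u\n", version);
  } else {
    std::fprintf(stderr, "Neuron_getVersion failed with %d\n", err);
  }
}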
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "NeuronAdapter.h" // NOLINT
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
class NeuronAdapter final {
public:
static NeuronAdapter *Global();
// Platform APIs
using Neuron_getVersion_Type = int (*)(uint32_t *);
using NeuronModel_create_Type = int (*)(NeuronModel **);
using NeuronModel_free_Type = void (*)(NeuronModel *);
using NeuronModel_finish_Type = int (*)(NeuronModel *);
using NeuronModel_addOperand_Type = int (*)(NeuronModel *,
const NeuronOperandType *);
using NeuronModel_setOperandValue_Type = int (*)(NeuronModel *,
int32_t,
const void *,
size_t);
using NeuronModel_setOperandSymmPerChannelQuantParams_Type =
int (*)(NeuronModel *, int32_t, const NeuronSymmPerChannelQuantParams *);
using NeuronModel_addOperation_Type = int (*)(NeuronModel *,
NeuronOperationType,
uint32_t,
const uint32_t *,
uint32_t,
const uint32_t *);
using NeuronModel_identifyInputsAndOutputs_Type = int (*)(
NeuronModel *, uint32_t, const uint32_t *, uint32_t, const uint32_t *);
using NeuronCompilation_create_Type = int (*)(NeuronModel *,
NeuronCompilation **);
using NeuronCompilation_free_Type = void (*)(NeuronCompilation *);
using NeuronCompilation_finish_Type = int (*)(NeuronCompilation *);
using NeuronExecution_create_Type = int (*)(NeuronCompilation *,
NeuronExecution **);
using NeuronExecution_free_Type = void (*)(NeuronExecution *);
using NeuronExecution_setInput_Type = int (*)(NeuronExecution *,
int32_t,
const NeuronOperandType *,
const void *,
size_t);
using NeuronExecution_setOutput_Type = int (*)(
NeuronExecution *, int32_t, const NeuronOperandType *, void *, size_t);
using NeuronExecution_compute_Type = int (*)(NeuronExecution *);
Neuron_getVersion_Type Neuron_getVersion() {
CHECK(Neuron_getVersion_ != nullptr) << "Cannot load Neuron_getVersion!";
return Neuron_getVersion_;
}
NeuronModel_create_Type NeuronModel_create() {
CHECK(NeuronModel_create_ != nullptr) << "Cannot load NeuronModel_create!";
return NeuronModel_create_;
}
NeuronModel_free_Type NeuronModel_free() {
CHECK(NeuronModel_free_ != nullptr) << "Cannot load NeuronModel_free!";
return NeuronModel_free_;
}
NeuronModel_finish_Type NeuronModel_finish() {
CHECK(NeuronModel_finish_ != nullptr) << "Cannot load NeuronModel_finish!";
return NeuronModel_finish_;
}
NeuronModel_addOperand_Type NeuronModel_addOperand() {
CHECK(NeuronModel_addOperand_ != nullptr)
<< "Cannot load NeuronModel_addOperand!";
return NeuronModel_addOperand_;
}
NeuronModel_setOperandValue_Type NeuronModel_setOperandValue() {
CHECK(NeuronModel_setOperandValue_ != nullptr)
<< "Cannot load NeuronModel_setOperandValue!";
return NeuronModel_setOperandValue_;
}
NeuronModel_setOperandSymmPerChannelQuantParams_Type
NeuronModel_setOperandSymmPerChannelQuantParams() {
CHECK(NeuronModel_setOperandSymmPerChannelQuantParams_ != nullptr)
<< "Cannot load NeuronModel_setOperandSymmPerChannelQuantParams!";
return NeuronModel_setOperandSymmPerChannelQuantParams_;
}
NeuronModel_addOperation_Type NeuronModel_addOperation() {
CHECK(NeuronModel_addOperation_ != nullptr)
<< "Cannot load NeuronModel_addOperation!";
return NeuronModel_addOperation_;
}
NeuronModel_identifyInputsAndOutputs_Type
NeuronModel_identifyInputsAndOutputs() {
CHECK(NeuronModel_identifyInputsAndOutputs_ != nullptr)
<< "Cannot load NeuronModel_identifyInputsAndOutputs!";
return NeuronModel_identifyInputsAndOutputs_;
}
NeuronCompilation_create_Type NeuronCompilation_create() {
CHECK(NeuronCompilation_create_ != nullptr)
<< "Cannot load NeuronCompilation_create!";
return NeuronCompilation_create_;
}
NeuronCompilation_free_Type NeuronCompilation_free() {
CHECK(NeuronCompilation_free_ != nullptr)
<< "Cannot load NeuronCompilation_free!";
return NeuronCompilation_free_;
}
NeuronCompilation_finish_Type NeuronCompilation_finish() {
CHECK(NeuronCompilation_finish_ != nullptr)
<< "Cannot load NeuronCompilation_finish!";
return NeuronCompilation_finish_;
}
NeuronExecution_create_Type NeuronExecution_create() {
CHECK(NeuronExecution_create_ != nullptr)
<< "Cannot load NeuronExecution_create!";
return NeuronExecution_create_;
}
NeuronExecution_free_Type NeuronExecution_free() {
CHECK(NeuronExecution_free_ != nullptr)
<< "Cannot load NeuronExecution_free!";
return NeuronExecution_free_;
}
NeuronExecution_setInput_Type NeuronExecution_setInput() {
CHECK(NeuronExecution_setInput_ != nullptr)
<< "Cannot loadcl NeuronExecution_setInput!";
return NeuronExecution_setInput_;
}
NeuronExecution_setOutput_Type NeuronExecution_setOutput() {
CHECK(NeuronExecution_setOutput_ != nullptr)
<< "Cannot load NeuronExecution_setOutput!";
return NeuronExecution_setOutput_;
}
NeuronExecution_compute_Type NeuronExecution_compute() {
CHECK(NeuronExecution_compute_ != nullptr)
<< "Cannot load NeuronExecution_compute!";
return NeuronExecution_compute_;
}
private:
NeuronAdapter();
NeuronAdapter(const NeuronAdapter &) = delete;
NeuronAdapter &operator=(const NeuronAdapter &) = delete;
bool InitHandle();
void InitFunctions();
void *handle_{nullptr};
Neuron_getVersion_Type Neuron_getVersion_{nullptr};
NeuronModel_create_Type NeuronModel_create_{nullptr};
NeuronModel_free_Type NeuronModel_free_{nullptr};
NeuronModel_finish_Type NeuronModel_finish_{nullptr};
NeuronModel_addOperand_Type NeuronModel_addOperand_{nullptr};
NeuronModel_setOperandValue_Type NeuronModel_setOperandValue_{nullptr};
NeuronModel_setOperandSymmPerChannelQuantParams_Type
NeuronModel_setOperandSymmPerChannelQuantParams_{nullptr};
NeuronModel_addOperation_Type NeuronModel_addOperation_{nullptr};
NeuronModel_identifyInputsAndOutputs_Type
NeuronModel_identifyInputsAndOutputs_{nullptr};
NeuronCompilation_create_Type NeuronCompilation_create_{nullptr};
NeuronCompilation_free_Type NeuronCompilation_free_{nullptr};
NeuronCompilation_finish_Type NeuronCompilation_finish_{nullptr};
NeuronExecution_create_Type NeuronExecution_create_{nullptr};
NeuronExecution_free_Type NeuronExecution_free_{nullptr};
NeuronExecution_setInput_Type NeuronExecution_setInput_{nullptr};
NeuronExecution_setOutput_Type NeuronExecution_setOutput_{nullptr};
NeuronExecution_compute_Type NeuronExecution_compute_{nullptr};
};
} // namespace lite
} // namespace paddle
......@@ -80,8 +80,10 @@ void conv_compute_6x6_3x3(const float* input,
const operators::ConvParam& param,
ARMContext* ctx) {
auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2];
const int pad_h0 = (*param.paddings)[0];
const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
......@@ -96,8 +98,8 @@ void conv_compute_6x6_3x3(const float* input,
int tile_h = (hout + 5) / 6;
int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2;
int h_pad = hin + pad_h * 2;
int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT
......@@ -127,10 +129,10 @@ void conv_compute_6x6_3x3(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride,
i * 4,
-pad_h,
hin + pad_h,
-pad_w,
win + pad_w,
-pad_h0,
hin + pad_h1,
-pad_w0,
win + pad_w1,
chin,
win,
hin,
......@@ -367,8 +369,10 @@ void conv_compute_2x2_3x3(const float* input,
const operators::ConvParam& param,
ARMContext* ctx) {
auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2];
const int pad_h0 = (*param.paddings)[0];
const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
......@@ -383,8 +387,8 @@ void conv_compute_2x2_3x3(const float* input,
int tile_h = (hout + 1) / 2;
int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2;
int h_pad = hin + pad_h * 2;
int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT
......@@ -414,10 +418,10 @@ void conv_compute_2x2_3x3(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride,
i * 4,
-pad_h,
hin + pad_h,
-pad_w,
win + pad_w,
-pad_h0,
hin + pad_h1,
-pad_w0,
win + pad_w1,
chin,
win,
hin,
......@@ -628,8 +632,10 @@ void conv_compute_2x2_3x3_small(const float* input,
const operators::ConvParam& param,
ARMContext* ctx) {
auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2];
const int pad_h0 = (*param.paddings)[0];
const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
......@@ -644,8 +650,8 @@ void conv_compute_2x2_3x3_small(const float* input,
int tile_h = (hout + 1) / 2;
int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2;
int h_pad = hin + pad_h * 2;
int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT
......@@ -676,10 +682,10 @@ void conv_compute_2x2_3x3_small(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride,
i * 4,
-pad_h,
hin + pad_h,
-pad_w,
win + pad_w,
-pad_h0,
hin + pad_h1,
-pad_w0,
win + pad_w1,
chin,
win,
hin,
......
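The three hunks above switch the winograd kernels from symmetric to asymmetric padding. A small sketch of the arithmetic involved (illustrative only, the struct and names are mine): the padded extent is the input size plus the leading and trailing pads, so a 3x3 input with pads {top=1, bottom=0, left=2, right=1} becomes 4x6.
struct PaddedExtent {
  int h_pad;
  int w_pad;
};
// Mirrors the w_pad / h_pad computation in the winograd kernels above.
PaddedExtent ComputePaddedExtent(int hin, int win,
                                 int pad_h0, int pad_h1,
                                 int pad_w0, int pad_w1) {
  return {hin + pad_h0 + pad_h1, win + pad_w0 + pad_w1};
}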
......@@ -21,6 +21,17 @@ namespace paddle {
namespace lite {
namespace arm {
namespace math {
int AdaptStartIndex(int ph, int input_size, int output_size) {
return static_cast<int>(
floor(static_cast<double>(ph * input_size) / output_size));
}
int AdaptEndIndex(int ph, int input_size, int output_size) {
return static_cast<int>(
ceil(static_cast<double>((ph + 1) * input_size) / output_size));
}
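// A worked example of the adaptive-index helpers above (my own check, not
// part of the patch): for input size 7 and adaptive output size 3 the windows
// are [0,3), [2,5) and [4,7), i.e. they cover the whole axis with slight
// overlap.
#include <cassert>  // assumed available; only needed for this standalone check
void adaptive_index_example() {
  assert(AdaptStartIndex(0, 7, 3) == 0 && AdaptEndIndex(0, 7, 3) == 3);
  assert(AdaptStartIndex(1, 7, 3) == 2 && AdaptEndIndex(1, 7, 3) == 5);
  assert(AdaptStartIndex(2, 7, 3) == 4 && AdaptEndIndex(2, 7, 3) == 7);
}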
void pooling_basic(const float* din,
float* dout,
int num,
......@@ -88,15 +99,27 @@ void pooling_basic(const float* din,
#pragma omp parallel for
for (int ind_c = 0; ind_c < chin; ++ind_c) {
for (int ind_h = 0; ind_h < hout; ++ind_h) {
int sh = ind_h * stride_h;
int eh = sh + kernel_h;
sh = (sh - pad_h) < 0 ? 0 : sh - pad_h;
eh = (eh - pad_h) > hin ? hin : eh - pad_h;
int sh, eh;
if (adaptive) {
sh = AdaptStartIndex(ind_h, hin, hout);
eh = AdaptEndIndex(ind_h, hin, hout);
} else {
sh = ind_h * stride_h;
eh = sh + kernel_h;
sh = (sh - pad_h) < 0 ? 0 : sh - pad_h;
eh = (eh - pad_h) > hin ? hin : eh - pad_h;
}
for (int ind_w = 0; ind_w < wout; ++ind_w) {
int sw = ind_w * stride_w;
int ew = sw + kernel_w;
sw = (sw - pad_w) < 0 ? 0 : sw - pad_w;
ew = (ew - pad_w) > win ? win : ew - pad_w;
int sw, ew;
if (adaptive) {
sw = AdaptStartIndex(ind_w, win, wout);
ew = AdaptEndIndex(ind_w, win, wout);
} else {
sw = ind_w * stride_w;
ew = sw + kernel_w;
sw = (sw - pad_w) < 0 ? 0 : sw - pad_w;
ew = (ew - pad_w) > win ? win : ew - pad_w;
}
float result = static_cast<float>(0);
int dst_ind = (ind_n * chout + ind_c) * size_channel_out +
ind_h * wout + ind_w;
......
This diff has been collapsed.
......@@ -40,6 +40,15 @@ void scale_compute_basic(const operators::ScaleParam& param) {
template <typename T>
void scale(const T* din, T* dout, int num, T scale, T bias);
template <typename T>
void scale_relu(const T* din, T* dout, int num, T scale, T bias);
template <typename T>
void scale_relu6(const T* din, T* dout, int num, T scale, T bias, T alpha);
template <typename T>
void scale_leaky_relu(const T* din, T* dout, int num, T scale, T bias, T alpha);
template <typename T>
void scale(const T* din,
T* dout,
......
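Reference semantics for the new fused variants declared above (a plain scalar sketch, not the optimized ARM kernels): scale computes scale * x + bias, and each fused form simply applies the activation to that result.
float scale_relu_ref(float x, float scale, float bias) {
  float y = scale * x + bias;
  return y > 0.f ? y : 0.f;
}
float scale_relu6_ref(float x, float scale, float bias, float alpha) {
  float y = scale * x + bias;
  y = y > 0.f ? y : 0.f;
  return y < alpha ? y : alpha;  // alpha assumed to be the clipping threshold (typically 6)
}
float scale_leaky_relu_ref(float x, float scale, float bias, float alpha) {
  float y = scale * x + bias;
  return y > 0.f ? y : alpha * y;  // alpha assumed to be the negative slope
}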
......@@ -24,16 +24,17 @@ std::map<int, void*> TargetWrapperBM::bm_hds_;
size_t TargetWrapperBM::num_devices() {
int count = 0;
bm_dev_getcount(&count);
bm_status_t ret = bm_dev_getcount(&count);
CHECK_EQ(ret, BM_SUCCESS) << "Failed with error code: "
<< static_cast<int>(ret);
return count;
}
int TargetWrapperBM::GetDevice() { return device_id_; }
void TargetWrapperBM::SetDevice(int id) {
/*
if (id < 0 || (size_t)id >= num_devices()) {
LOG(FATAL) << "Failed with invalid device id " << id;
}
*/
if (id < 0 || (size_t)id >= num_devices()) {
LOG(FATAL) << "Failed with invalid device id " << id;
}
device_id_ = id;
if (bm_hds_.find(id) == bm_hds_.end()) {
bm_handle_t bm_handle;
......
......@@ -31,6 +31,7 @@ class TargetWrapper<TARGET(kBM)> {
static size_t maximum_stream() { return 0; }
static void SetDevice(int id);
static int GetDevice();
static void CreateStream(stream_t* stream) {}
static void DestroyStream(const stream_t& stream) {}
......
......@@ -129,6 +129,26 @@ bool CLRuntime::InitializePlatform() {
return true;
}
GpuType CLRuntime::ParseGpuTypeFromDeviceName(std::string device_name) {
const std::string kMALI_PATTERN_STR = "Mali";
const std::string kADRENO_PATTERN_STR = "QUALCOMM Adreno(TM)";
const std::string kPOWERVR_PATTERN_STR = "PowerVR";
if (device_name == kADRENO_PATTERN_STR) {
LOG(INFO) << "adreno gpu";
return GpuType::QUALCOMM_ADRENO;
} else if (device_name.find(kMALI_PATTERN_STR) != std::string::npos) {
LOG(INFO) << "mali gpu";
return GpuType::ARM_MALI;
} else if (device_name.find(kPOWERVR_PATTERN_STR) != std::string::npos) {
LOG(INFO) << "powerVR gpu";
return GpuType::IMAGINATION_POWERVR;
} else {
LOG(INFO) << "others gpu";
return GpuType::UNKNOWN;
}
}
bool CLRuntime::InitializeDevice() {
// ===================== BASIC =====================
// CL_DEVICE_TYPE_GPU
......@@ -148,6 +168,7 @@ bool CLRuntime::InitializeDevice() {
auto device_name = device_->getInfo<CL_DEVICE_NAME>();
LOG(INFO) << "Using device: " << device_name;
gpu_type_ = ParseGpuTypeFromDeviceName(device_name);
cl_device_type device_type = device_->getInfo<CL_DEVICE_TYPE>();
auto device_type_to_str = [](cl_device_type t) -> std::string {
......@@ -296,5 +317,53 @@ std::map<std::string, size_t>& CLRuntime::GetDeviceInfo() {
return device_info_;
}
void CLRuntime::GetAdrenoContextProperties(
std::vector<cl_context_properties>* properties,
GPUPerfMode gpu_perf_mode,
GPUPriorityLevel gpu_priority_level) {
CHECK(properties) << "cl_context_properties is nullptr";
properties->reserve(5);
switch (gpu_perf_mode) {
case GPUPerfMode::PERF_LOW:
LOG(INFO) << "GPUPerfMode::PERF_LOW";
properties->push_back(CL_CONTEXT_PERF_MODE_QCOM);
properties->push_back(CL_PERF_MODE_LOW_QCOM);
break;
case GPUPerfMode::PERF_NORMAL:
LOG(INFO) << "GPUPerfMode::PERF_NORMAL";
properties->push_back(CL_CONTEXT_PERF_MODE_QCOM);
properties->push_back(CL_PERF_MODE_NORMAL_QCOM);
break;
case GPUPerfMode::PERF_HIGH:
LOG(INFO) << "GPUPerfMode::PERF_HIGH";
properties->push_back(CL_CONTEXT_PERF_MODE_QCOM);
properties->push_back(CL_PERF_MODE_HIGH_QCOM);
break;
default:
break;
}
switch (gpu_priority_level) {
case GPUPriorityLevel::PRIORITY_LOW:
LOG(INFO) << "GPUPriorityLevel::PRIORITY_LOW";
properties->push_back(CL_CONTEXT_PRIORITY_LEVEL_QCOM);
properties->push_back(CL_PRIORITY_HINT_LOW_QCOM);
break;
case GPUPriorityLevel::PRIORITY_NORMAL:
LOG(INFO) << "GPUPriorityLevel::PRIORITY_NORMAL";
properties->push_back(CL_CONTEXT_PRIORITY_LEVEL_QCOM);
properties->push_back(CL_PRIORITY_HINT_NORMAL_QCOM);
break;
case GPUPriorityLevel::PRIORITY_HIGH:
LOG(INFO) << "GPUPriorityLevel::PRIORITY_HIGH";
properties->push_back(CL_CONTEXT_PRIORITY_LEVEL_QCOM);
properties->push_back(CL_PRIORITY_HINT_HIGH_QCOM);
break;
default:
break;
}
// The properties list should be terminated with 0
properties->push_back(0);
}
} // namespace lite
} // namespace paddle
......@@ -19,6 +19,45 @@ limitations under the License. */
#include "lite/backends/opencl/cl_include.h"
#include "lite/backends/opencl/cl_utility.h"
typedef enum {
UNKNOWN = 0,
QUALCOMM_ADRENO = 1,
ARM_MALI = 2,
IMAGINATION_POWERVR = 3,
OTHERS = 4,
} GpuType;
typedef enum {
PERF_DEFAULT = 0,
PERF_LOW = 1,
PERF_NORMAL = 2,
PERF_HIGH = 3
} GPUPerfMode;
typedef enum {
PRIORITY_DEFAULT = 0,
PRIORITY_LOW = 1,
PRIORITY_NORMAL = 2,
PRIORITY_HIGH = 3
} GPUPriorityLevel;
// Adreno extensions
// Adreno performance hints
typedef cl_uint cl_perf_hint;
#define CL_CONTEXT_PERF_MODE_QCOM 0x40C2
#define CL_PERF_MODE_HIGH_QCOM 0x40C3
#define CL_PERF_MODE_NORMAL_QCOM 0x40C4
#define CL_PERF_MODE_LOW_QCOM 0x40C5
// Adreno priority hints
typedef cl_uint cl_priority_hint;
#define CL_PRIORITY_HINT_NONE_QCOM 0
#define CL_CONTEXT_PRIORITY_LEVEL_QCOM 0x40C9
#define CL_PRIORITY_HINT_HIGH_QCOM 0x40CA
#define CL_PRIORITY_HINT_NORMAL_QCOM 0x40CB
#define CL_PRIORITY_HINT_LOW_QCOM 0x40CC
namespace paddle {
namespace lite {
......@@ -63,9 +102,28 @@ class CLRuntime {
bool InitializeDevice();
void GetAdrenoContextProperties(
std::vector<cl_context_properties>* properties,
GPUPerfMode gpu_perf_mode,
GPUPriorityLevel gpu_priority_level);
std::shared_ptr<cl::Context> CreateContext() {
auto context = std::make_shared<cl::Context>(
std::vector<cl::Device>{device()}, nullptr, nullptr, nullptr, &status_);
// note(ysh329): the GPU perf mode and priority level for Adreno GPUs are
// referred from xiaomi/mace. However, no performance gain was observed
// after setting `PERF_HIGH` and `PRIORITY_HIGH`.
auto perf_mode = GPUPerfMode::PERF_HIGH;
auto priority_level = GPUPriorityLevel::PRIORITY_HIGH;
std::vector<cl_context_properties> context_properties;
if (gpu_type_ == GpuType::QUALCOMM_ADRENO) {
GetAdrenoContextProperties(
&context_properties, perf_mode, priority_level);
}
auto context =
std::make_shared<cl::Context>(std::vector<cl::Device>{device()},
context_properties.data(),
nullptr,
nullptr,
&status_);
CL_CHECK_FATAL(status_);
return context;
}
......@@ -83,8 +141,12 @@ class CLRuntime {
return queue;
}
GpuType ParseGpuTypeFromDeviceName(std::string device_name);
std::map<std::string, size_t> device_info_;
GpuType gpu_type_{GpuType::UNKNOWN};
std::string cl_path_;
std::shared_ptr<cl::Platform> platform_{nullptr};
......
......@@ -32,7 +32,7 @@ const char* opencl_error_to_str(cl_int error);
__FILE__, \
__LINE__); \
}
#ifndef LITE_SHUTDOWN_LOG
#ifdef LITE_WITH_LOG
#define CL_CHECK_FATAL(err_code__) \
if (err_code__ != CL_SUCCESS) { \
LOG(FATAL) << string_format( \
......
......@@ -129,8 +129,7 @@ struct RowwiseAdd<lite::TargetType::kX86, T> {
T* output_data = output->template mutable_data<T>();
for (int64_t i = 0; i < in_dims[0]; ++i) {
for (int64_t j = 0; j < size; ++j) {
output_data[i * in_dims[0] + j] =
input_data[i * in_dims[0] + j] + vector_data[j];
output_data[i * size + j] = input_data[i * size + j] + vector_data[j];
}
}
}
......
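A minimal sketch of the row-major indexing the fix above restores (the container types and names are mine, not the library's): for an input with `size` columns, element (i, j) lives at offset i * size + j; using in_dims[0] (the row count) as the stride only happens to work for square inputs.
#include <vector>
std::vector<float> rowwise_add_ref(const std::vector<float>& input,
                                   const std::vector<float>& vec,
                                   int rows, int size) {
  std::vector<float> output(rows * size);
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < size; ++j) {
      output[i * size + j] = input[i * size + j] + vec[j];  // row-major offset
    }
  }
  return output;
}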
......@@ -279,7 +279,7 @@ struct MergeAdd<lite::TargetType::kX86, T> {
}
}
if (has_value_input == nullptr) {
VLOG(3) << "no input has value! just return" << std::endl;
VLOG(3) << "no input has value! just return";
return;
}
auto input_width = has_value_input->value().dims()[1];
......
......@@ -19,6 +19,7 @@ namespace lite {
#ifdef LITE_WITH_XPU
thread_local xdnn::Context* Context<TargetType::kXPU>::_tls_raw_ctx{nullptr};
int Context<TargetType::kXPU>::_workspace_l3_size_per_thread{0};
#endif
} // namespace lite
......
......@@ -110,9 +110,7 @@ class Context<TargetType::kBM> {
Context() {}
explicit Context(const BMContext& ctx);
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() { Init(0); }
void Init(int dev_id) { TargetWrapperBM::SetDevice(dev_id); }
void InitOnce() { TargetWrapperBM::SetDevice(TargetWrapperBM::GetDevice()); }
void CopySharedTo(BMContext* ctx) {}
void* GetHandle() { return TargetWrapperBM::GetHandle(); }
......@@ -151,14 +149,23 @@ class Context<TargetType::kXPU> {
if (_tls_raw_ctx == nullptr) {
_tls_raw_ctx = xdnn::create_context();
CHECK(_tls_raw_ctx);
int r = xdnn::set_workspace_l3_size(_tls_raw_ctx,
_workspace_l3_size_per_thread);
if (r != 0) {
LOG(WARNING) << "xdnn::set_workspace_l3_size() failed, r = " << r
<< ", _workspace_l3_size_per_thread = "
<< _workspace_l3_size_per_thread;
}
}
return _tls_raw_ctx;
}
static void SetWorkspaceL3Size(int l3_size = 0xfffc00) {
xdnn::set_workspace_l3_size(GetRawContext(), l3_size);
_workspace_l3_size_per_thread = l3_size;
}
// **DEPRECATED**, use xpu_set_device() at the very beginning of each worker
// thread
static void SetDev(int dev_no = 0) {
const char* dev_env = getenv("LITE_XPU_DEV");
if (dev_env) {
......@@ -173,6 +180,7 @@ class Context<TargetType::kXPU> {
private:
static thread_local xdnn::Context* _tls_raw_ctx;
static int _workspace_l3_size_per_thread;
};
#endif
......
......@@ -1240,6 +1240,19 @@ void Device<TARGET(kMLU)>::CreateQueue() {
}
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_BM
void Device<TARGET(kBM)>::SetId(int device_id) {
LOG(INFO) << "Set bm device " << device_id;
TargetWrapper<TARGET(kBM)>::SetDevice(device_id);
idx_ = device_id;
}
void Device<TARGET(kBM)>::Init() { SetId(idx_); }
int Device<TARGET(kBM)>::core_num() {
return TargetWrapper<TARGET(kBM)>::num_devices();
}
#endif // LITE_WITH_BM
#ifdef LITE_WITH_CUDA
void Device<TARGET(kCUDA)>::Init() {
......
......@@ -221,6 +221,49 @@ class Device<TARGET(kMLU)> {
template class Env<TARGET(kMLU)>;
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_BM
template <>
class Device<TARGET(kBM)> {
public:
Device(int dev_id, int max_stream = 1)
: idx_(dev_id), max_stream_(max_stream) {}
void Init();
int id() { return idx_; }
int max_stream() { return 1; }
std::string name() { return "BM"; }
float max_memory() { return 16; }
int core_num();
void SetId(int idx);
int sm_version() { return 0; }
bool has_fp16() { return false; }
bool has_int8() { return false; }
bool has_hmma() { return false; }
bool has_imma() { return false; }
int runtime_version() { return 0; }
private:
void CreateQueue() {}
void GetInfo() {}
private:
int idx_{0};
int max_stream_{1};
std::string device_name_;
float max_memory_;
int sm_version_;
bool has_fp16_;
bool has_int8_;
bool has_hmma_;
bool has_imma_;
int runtime_version_;
};
template class Env<TARGET(kBM)>;
#endif
#ifdef LITE_WITH_CUDA
template <>
class Device<TARGET(kCUDA)> {
......
......@@ -21,9 +21,13 @@ lite_cc_library(mir_passes
fusion/elementwise_add_activation_fuse_pass.cc
fusion/quant_dequant_fuse_pass.cc
fusion/sequence_pool_concat_fuse_pass.cc
fusion/scale_activation_fuse_pass.cc
fusion/__xpu__resnet_fuse_pass.cc
fusion/__xpu__multi_encoder_fuse_pass.cc
fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc
fusion/__xpu__fc_fuse_pass.cc
elimination/identity_scale_eliminate_pass.cc
elimination/identity_dropout_eliminate_pass.cc
elimination/elementwise_mul_constant_eliminate_pass.cc
static_kernel_pick_pass.cc
variable_place_inference_pass.cc
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/pass.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace {
class Eliminator : public FuseBase {
public:
void BuildPattern() override {
// the previous op's output needs updating
auto* pre_op = OpNode("preop")->assert_is_not_op_type("conditional_block");
// TODO(Superjomn) check has only one output
auto* x = VarNode("x")->assert_is_op_input("dropout", "X");
auto* dropout_op = OpNode("dropout", "dropout")
->assert_op_attr<int>("is_test", 1)
->assert_op_attr<std::string>(
"dropout_implementation", "upscale_in_train");
auto* out = VarNode("out")->assert_is_op_output("dropout", "Out");
auto* mask = VarNode("mask")->assert_is_op_output("dropout", "Mask");
*pre_op >> *x >> *dropout_op >> *out;
*dropout_op >> *mask;
// The pre_op will be eliminated, and a new output-updated op will be inserted.
x->AsIntermediate(); // x is pre_op's output, need to update
dropout_op->AsIntermediate();
mask->AsIntermediate();
}
private:
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
auto& pre_op = matched.at("preop")->AsStmt();
auto op_info = *pre_op.op_info();
op_info.UpdateAllOutputs(matched.at("x")->AsArg().name,
matched.at("out")->AsArg().name);
pre_op.ResetOp(op_info, graph->valid_places());
IR_NODE_LINK_TO(matched.at("preop"), matched.at("out"));
}
};
} // namespace
class IdentityDropoutEliminatePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
Eliminator eliminator;
eliminator(graph.get());
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(identity_dropout_eliminate_pass,
paddle::lite::mir::IdentityDropoutEliminatePass)
.BindTargets({TARGET(kXPU)});
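Reference semantics motivating the pass above (a scalar sketch under the usual Paddle dropout definitions, not library code): at inference time the "upscale_in_train" implementation leaves its input untouched, so the op is an identity and can be folded away, whereas "downgrade_in_infer" scales by (1 - dropout_prob) and must be kept.
float dropout_infer_ref(float x, float dropout_prob, bool upscale_in_train) {
  return upscale_in_train ? x : x * (1.0f - dropout_prob);
}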
......@@ -26,7 +26,9 @@ class Eliminator : public FuseBase {
public:
void BuildPattern() override {
// the previous op's output needs updating
auto* pre_op = OpNode("preop")->assert_is_not_op_type("conditional_block");
auto* pre_op = OpNode("preop")
->assert_is_not_op_type("conditional_block")
->assert_is_not_op_type("scale");
// TODO(Superjomn) check has only one output
auto* x = VarNode("x")->assert_is_op_input("scale", "X");
auto* scale_op = OpNode("scale", "scale")
......
......@@ -31,6 +31,9 @@ lite_cc_library(fuse_interpolate
lite_cc_library(fuse_sequence_pool_concat
SRCS sequence_pool_concat_fuser.cc
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_scale_activation
SRCS scale_activation_fuser.cc
DEPS pattern_matcher_high_api)
set(mir_fusers
fuse_fc
......@@ -44,6 +47,7 @@ set(mir_fusers
fuse_transpose_softmax_transpose
fuse_interpolate
fuse_sequence_pool_concat
fuse_scale_activation
CACHE INTERNAL "fusers")
if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <vector>
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/xpu_pattern_matcher_high_api.h"
#include "lite/utils/string.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class XPUEmbeddingWithEltwiseAddFuser : public FuseBase {
public:
explicit XPUEmbeddingWithEltwiseAddFuser(int n_embedding)
: n_embedding_(n_embedding) {}
void BuildPattern() override {
auto* ids0 =
VarNode("ids0")->assert_is_op_input("lookup_table", "Ids")->AsInput();
auto* table0 =
VarNode("table0")->assert_is_op_input("lookup_table", "W")->AsInput();
auto* embedding0 = OpNode("embedding0", "lookup_table");
auto* embedding_out0 = VarNode("embedding_out0")
->assert_is_op_output("lookup_table", "Out")
->assert_is_op_input("elementwise_add", "X")
->AsIntermediate();
auto* ids1 =
VarNode("ids1")->assert_is_op_input("lookup_table", "Ids")->AsInput();
auto* table1 =
VarNode("table1")->assert_is_op_input("lookup_table", "W")->AsInput();
auto* embedding1 = OpNode("embedding1", "lookup_table")->AsIntermediate();
auto* embedding_out1 = VarNode("embedding_out1")
->assert_is_op_output("lookup_table", "Out")
->assert_is_op_input("elementwise_add", "Y")
->AsIntermediate();
auto* ewadd01 = OpNode("ewadd01", "elementwise_add")->AsIntermediate();
auto* ewadd01_out = VarNode("ewadd01_out")
->assert_is_op_output("elementwise_add", "Out")
->AsIntermediate();
embedding0->LinksFrom({ids0, table0});
embedding0->LinksTo({embedding_out0});
embedding1->LinksFrom({ids1, table1});
embedding1->LinksTo({embedding_out1});
ewadd01->LinksFrom({embedding_out0, embedding_out1});
ewadd01->LinksTo({ewadd01_out});
auto* last_ewadd_out = ewadd01_out;
for (int i = 2; i < n_embedding_; ++i) {
auto ids_name = paddle::lite::string_format("ids%d", i);
auto table_name = paddle::lite::string_format("table%d", i);
auto embedding_name = paddle::lite::string_format("embedding%d", i);
auto embedding_out_name =
paddle::lite::string_format("embedding_out%d", i);
auto* new_ids = VarNode(ids_name)
->assert_is_op_input("lookup_table", "Ids")
->AsInput();
auto* new_table = VarNode(table_name)
->assert_is_op_input("lookup_table", "W")
->AsInput();
auto* new_embedding =
OpNode(embedding_name, "lookup_table")->AsIntermediate();
auto* new_embedding_out = VarNode(embedding_out_name)
->assert_is_op_output("lookup_table", "Out")
->assert_is_op_input("elementwise_add", "Y")
->AsIntermediate();
new_embedding->LinksFrom({new_ids, new_table});
new_embedding->LinksTo({new_embedding_out});
auto ewadd_name = paddle::lite::string_format("ewadd%d%d", i - 1, i);
auto ewadd_out_name = ewadd_name + "_out";
auto* new_ewadd = OpNode(ewadd_name, "elementwise_add")->AsIntermediate();
auto* new_ewadd_out = VarNode(ewadd_out_name)
->assert_is_op_output("elementwise_add", "Out")
->AsIntermediate();
new_ewadd->LinksFrom({last_ewadd_out, new_embedding_out});
new_ewadd->LinksTo({new_ewadd_out});
last_ewadd_out = new_ewadd_out;
}
last_ewadd_out->AsOutput();
}
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
cpp::OpDesc op_desc;
op_desc.SetType("__xpu__embedding_with_eltwise_add");
std::vector<std::string> ids_names;
std::vector<std::string> table_names;
for (int i = 0; i < n_embedding_; ++i) {
auto ids_name = paddle::lite::string_format("ids%d", i);
ids_names.push_back(matched.at(ids_name)->arg()->name);
auto table_name = paddle::lite::string_format("table%d", i);
table_names.push_back(matched.at(table_name)->arg()->name);
}
op_desc.SetInput("Ids", ids_names);
op_desc.SetInput("Tables", table_names);
auto output_name = paddle::lite::string_format(
"ewadd%d%d_out", n_embedding_ - 2, n_embedding_ - 1);
op_desc.SetOutput("Output", {matched.at(output_name)->arg()->name});
op_desc.SetAttr<int>("n_embedding", n_embedding_);
auto* embedding0_op_info = matched.at("embedding0")->stmt()->op_info();
op_desc.SetAttr<int64_t>(
"padding_idx", embedding0_op_info->GetAttr<int64_t>("padding_idx"));
auto* new_stmt = matched.at("embedding0")->stmt();
auto new_op = LiteOpRegistry::Global().Create(op_desc.Type());
new_op->Attach(op_desc, new_stmt->op()->scope());
new_op->SetValidPlaces(new_stmt->op()->valid_places());
auto kernels = new_op->CreateKernels(new_op->valid_places());
new_stmt->SetOp(new_op);
new_stmt->SetKernels(std::move(kernels));
for (int i = 0; i < n_embedding_; ++i) {
auto ids_name = paddle::lite::string_format("ids%d", i);
auto table_name = paddle::lite::string_format("table%d", i);
DirectedLink(matched.at(ids_name), matched.at("embedding0"));
DirectedLink(matched.at(table_name), matched.at("embedding0"));
}
IR_OP_VAR_LINK(matched.at("embedding0"), matched.at(output_name));
}
private:
int n_embedding_;
};
} // namespace fusion
class XPUEmbeddingWithEltwiseAddFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
for (int n_embedding : {4, 3}) {
fusion::XPUEmbeddingWithEltwiseAddFuser fuser(n_embedding);
fuser(graph.get());
}
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass,
paddle::lite::mir::XPUEmbeddingWithEltwiseAddFusePass)
.BindTargets({TARGET(kXPU)})
.BindKernel("lookup_table");
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include "lite/backends/xpu/math.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class XPUFcFuser : public FuseBase {
public:
explicit XPUFcFuser(bool with_relu) : with_relu_(with_relu) {}
void BuildPattern() override {
// create nodes.
auto* x = VarNode("x")->assert_is_op_input("mul", "X");
auto* W = VarNode("W")->assert_is_op_input("mul", "Y");
auto* b = VarNode("b")->assert_is_persistable_var();
auto* mul = OpNode("mul", "mul");
auto* mul_out = VarNode("mul_out");
auto* add = OpNode("add", "elementwise_add");
auto* Out = VarNode("Out");
// create topology.
std::vector<PMNode*> mul_inputs{W, x};
std::vector<PMNode*> add_inputs{mul_out, b};
mul_inputs >> *mul >> *mul_out;
// Some op specialities.
mul_out->AsIntermediate();
mul->AsIntermediate();
add->AsIntermediate();
if (with_relu_) {
auto* add_out = VarNode("add_out");
auto* relu = OpNode("relu", "relu");
std::vector<PMNode*> relu_inputs{add_out};
add_inputs >> *add >> *add_out;
relu_inputs >> *relu >> *Out;
add_out->AsIntermediate();
relu->AsIntermediate();
} else {
add_inputs >> *add >> *Out;
}
}
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
auto mul = matched.at("mul")->stmt()->op();
auto* scope = mul->scope();
// convert W from float to int16, and transpose W
auto weight_name = matched.at("W")->arg()->name;
auto* weight_t = scope->FindMutableTensor(weight_name);
auto weight_dims = weight_t->dims();
int weight_len = weight_t->numel();
float* weight_on_host = weight_t->mutable_data<float>();
float max_f =
paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len);
std::unique_ptr<int16_t[]> weight_int16(new int16_t[weight_len]);
std::unique_ptr<int16_t[]> weight_trans_int16(new int16_t[weight_len]);
paddle::lite::xpu::math::ConvertFP32ToInt16(
weight_on_host, weight_int16.get(), max_f, weight_len);
paddle::lite::xpu::math::Transpose(weight_int16.get(),
weight_trans_int16.get(),
weight_dims[0],
weight_dims[1]);
memcpy(
weight_on_host, weight_trans_int16.get(), weight_len * sizeof(int16_t));
auto op_desc = GenOpDesc(matched, max_f, true);
auto fc_op = LiteOpRegistry::Global().Create("__xpu__fc");
auto& valid_places = mul->valid_places();
fc_op->Attach(op_desc, scope);
auto* new_op_node = graph->GraphCreateInstructNode(fc_op, valid_places);
IR_NODE_LINK_TO(matched.at("W"), new_op_node);
IR_NODE_LINK_TO(matched.at("x"), new_op_node);
IR_NODE_LINK_TO(matched.at("b"), new_op_node);
IR_NODE_LINK_TO(new_op_node, matched.at("Out"));
}
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched,
float w_max,
bool transpose_w) {
cpp::OpDesc op_desc = *matched.at("mul")->stmt()->op_info();
op_desc.mutable_inputs()->clear();
op_desc.mutable_outputs()->clear();
op_desc.SetType("__xpu__fc");
op_desc.SetInput("Input", {matched.at("x")->arg()->name});
op_desc.SetInput("W", {matched.at("W")->arg()->name});
op_desc.SetInput("Bias", {matched.at("b")->arg()->name});
op_desc.SetOutput("Out", {matched.at("Out")->arg()->name});
op_desc.SetAttr(
"in_num_col_dims",
matched.at("mul")->stmt()->op_info()->GetAttr<int>("x_num_col_dims"));
op_desc.SetAttr("w_max", w_max);
op_desc.SetAttr("transpose_w", transpose_w);
if (with_relu_) {
op_desc.SetAttr("activation_type", std::string{"relu"});
}
return op_desc;
}
bool with_relu_;
};
} // namespace fusion
class XPUFcFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
fusion::XPUFcFuser fuser(true /* with_relu */);
fuser(graph.get());
fusion::XPUFcFuser fuser2(false /* with_relu */);
fuser2(graph.get());
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(__xpu__fc_fuse_pass, paddle::lite::mir::XPUFcFusePass)
.BindTargets({TARGET(kXPU)})
.BindKernel("fc");
......@@ -16,6 +16,7 @@
#include <vector>
#include "lite/backends/xpu/math.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/type_precision_cast_pass.h" // For UpdateInputs()
#include "lite/core/mir/xpu_pattern_matcher_high_api.h"
#include "lite/operators/subgraph_op.h"
......@@ -588,8 +589,7 @@ class XPUMultiEncoderFuser {
multi_encoder_stmt->SetOp(multi_encoder_op);
multi_encoder_stmt->SetKernels(std::move(kernels));
// temp remove useless cast
std::unordered_set<const Node*> to_remove2;
// remove dangling/useless cast
Node* stack = nullptr;
for (auto* node : graph->StmtTopologicalOrder()) {
CHECK(node->IsStmt());
......@@ -597,16 +597,39 @@ class XPUMultiEncoderFuser {
stack = node;
}
}
Node* stack_out = stack->outlinks.front();
for (Node* cast : stack_out->outlinks) {
Node* cast_out = cast->outlinks.front();
if (cast_out->outlinks.size() == 0) {
// remove
to_remove2.insert(cast_out);
to_remove2.insert(cast);
if (stack) {
std::unordered_set<const Node*> to_remove2;
Node* stack_out = stack->outlinks.front();
// avoid modification while traversing
auto stack_out_outlinks = stack_out->outlinks;
for (Node* cast : stack_out_outlinks) {
if (cast->stmt()->op_info()->Type() != "cast") {
continue;
}
Node* cast_out = cast->outlinks.front();
if (cast_out->outlinks.size() == 0) {
// dangling cast
to_remove2.insert(cast);
to_remove2.insert(cast_out);
VLOG(3) << "Remove dangling cast [" << cast_out->arg()->name << "]";
} else if (cast_out->outlinks.size() == 1) {
// useless cast
to_remove2.insert(cast);
to_remove2.insert(cast_out);
VLOG(3) << "Remove useless cast [" << cast_out->arg()->name << "]";
auto* multi_encoder = cast_out->outlinks.front();
DirectedLink(stack_out, multi_encoder);
UpdateInputs(multi_encoder->stmt()->op().get(),
cast_out->arg()->name,
stack_out->arg()->name);
auto update_op_info = *multi_encoder->stmt()->op_info();
multi_encoder->stmt()->ResetOp(update_op_info, graph->valid_places());
}
}
GraphSafeRemoveNodes(graph, to_remove2);
}
GraphSafeRemoveNodes(graph, to_remove2);
}
};
......
......@@ -103,9 +103,12 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
std::string conv_weight_name = matched.at("conv_weight")->arg()->name;
auto conv_weight_t =
scope->FindVar(conv_weight_name)->GetMutable<lite::Tensor>();
auto groups = conv_op_desc->GetAttr<int>("groups");
bool depthwise = false;
if (conv_type_ == "conv2d_transpose") {
depthwise = (conv_weight_t->dims()[0] == conv_weight_t->dims()[1] * groups);
CHECK_EQ(static_cast<size_t>(bn_scale_t->data_size()),
static_cast<size_t>(conv_weight_t->dims()[1]))
static_cast<size_t>(conv_weight_t->dims()[1] * groups))
<< "The BN bias's size should be equal to the size of the first "
<< "dim size of the conv weights";
} else {
......@@ -159,7 +162,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
// compute new conv_weight for int8
auto weight_scale =
conv_op_desc->GetAttr<std::vector<float>>("weight_scale");
if (conv_type_ == "conv2d_transpose") {
if (conv_type_ == "conv2d_transpose" && !depthwise) {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
......@@ -199,7 +202,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
} else {
// compute new conv_weight
auto conv_weight_d = conv_weight_t->mutable_data<float>();
if (conv_type_ == "conv2d_transpose") {
if (conv_type_ == "conv2d_transpose" && !depthwise) {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
......
......@@ -23,11 +23,15 @@ namespace lite {
namespace mir {
void InterpolateFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fusion::InterpolateFuser bilinear_interp_fuser("bilinear_interp");
bilinear_interp_fuser(graph.get());
std::vector<std::string> Interpolate_type_cases{"bilinear_interp",
"nearest_interp"};
for (auto type_ : Interpolate_type_cases) {
fusion::InterpolateFuser interp_fuser(type_);
interp_fuser(graph.get());
fusion::InterpolateFuser nearest_interp_fuser("nearest_interp");
nearest_interp_fuser(graph.get());
fusion::InterpolateFuser2 interp_fuser2(type_);
interp_fuser2(graph.get());
}
}
} // namespace mir
......
......@@ -22,6 +22,9 @@ namespace mir {
namespace fusion {
void InterpolateFuser::BuildPattern() {
// type1 fill_constant -->
// x --> shape --> slice --> cast --> elementwise_mul --> interpolate
// `-------------------------------------------------->
auto* x = VarNode("x");
auto* shape = OpNode("shape", "shape")->AsIntermediate();
auto* shape_out = VarNode("shape_out")->AsIntermediate();
......@@ -89,6 +92,64 @@ cpp::OpDesc InterpolateFuser::GenOpDesc(const key2nodes_t& matched) {
return op_desc;
}
void InterpolateFuser2::BuildPattern() {
// type2 x --> shape --> slice --> cast --> scale --> interpolate
// `---------------------------------------->
auto* x = VarNode("x");
auto* shape = OpNode("shape", "shape")->AsIntermediate();
auto* shape_out = VarNode("shape_out")->AsIntermediate();
auto* slice = OpNode("slice", "slice")
->assert_op_attr_satisfied<std::vector<int>>(
"axes",
[](const std::vector<int>& attr) {
return attr.size() == 1 && attr[0] == 0;
})
->assert_op_attr_satisfied<std::vector<int>>(
"starts",
[](const std::vector<int>& attr) {
return attr.size() == 1 && attr[0] == 2;
})
->assert_op_attr_satisfied<std::vector<int>>(
"ends",
[](const std::vector<int>& attr) {
return attr.size() == 1 && attr[0] == 4;
})
->AsIntermediate();
auto* slice_out = VarNode("slice_out")->AsIntermediate();
auto* cast = OpNode("cast", "cast")->AsIntermediate();
auto* cast_out = VarNode("cast_out")->AsIntermediate();
auto* scale = OpNode("scale", "scale")->AsIntermediate();
auto* scale_out = VarNode("scale_out")->AsIntermediate();
auto* interpolate = OpNode("interpolate", interp_type_)->AsIntermediate();
auto* interpolate_out = VarNode("interpolate_out");
// create topology.
*x >> *shape >> *shape_out >> *slice >> *slice_out >> *cast >> *cast_out >>
*scale >> *scale_out >> *interpolate >> *interpolate_out;
*x >> *interpolate;
}
void InterpolateFuser2::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto op_desc = GenOpDesc(matched);
auto interp_op = LiteOpRegistry::Global().Create(interp_type_);
auto interp_old = matched.at("interpolate")->stmt()->op();
auto* scope = interp_old->scope();
auto& valid_places = interp_old->valid_places();
interp_op->Attach(op_desc, scope);
auto* new_op_node = graph->GraphCreateInstructNode(interp_op, valid_places);
IR_NODE_LINK_TO(matched.at("x"), new_op_node);
IR_NODE_LINK_TO(new_op_node, matched.at("interpolate_out"));
}
cpp::OpDesc InterpolateFuser2::GenOpDesc(const key2nodes_t& matched) {
auto op_desc = *matched.at("interpolate")->stmt()->op_info();
op_desc.SetInput("OutSize", {});
return op_desc;
}
} // namespace fusion
} // namespace mir
} // namespace lite
......
......@@ -36,6 +36,19 @@ class InterpolateFuser : public FuseBase {
std::string interp_type_;
};
class InterpolateFuser2 : public FuseBase {
public:
explicit InterpolateFuser2(const std::string& interp_type)
: interp_type_(interp_type) {}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
std::string interp_type_;
};
} // namespace fusion
} // namespace mir
} // namespace lite
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/scale_activation_fuse_pass.h"
#include <memory>
#include <vector>
#include "lite/core/mir/fusion/scale_activation_fuser.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void ScaleActivationFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
for (auto act_type : {"relu", "relu6", "leaky_relu"}) {
fusion::ScaleActivationFuser fuser(act_type);
fuser(graph.get());
}
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(lite_scale_activation_fuse_pass,
paddle::lite::mir::ScaleActivationFusePass)
.BindTargets({TARGET(kARM)})
.BindKernel("scale");
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
class ScaleActivationFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/scale_activation_fuser.h"
#include <memory>
#include <vector>
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
void ScaleActivationFuser::BuildPattern() {
// create input nodes.
auto* x = VarNode("x")->assert_is_op_input("scale", "X")->AsInput();
// create op nodes
auto* scale =
OpNode("scale", "scale")->assert_is_op("scale")->AsIntermediate();
auto* act =
OpNode("act", act_type_)->assert_is_op(act_type_)->AsIntermediate();
// create intermediate nodes
auto* scale_out = VarNode("scale_out")
->assert_is_op_output("scale", "Out")
->assert_is_op_input(act_type_, "X")
->AsIntermediate();
// create output node
auto* out =
VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput();
// create topology.
*x >> *scale >> *scale_out;
*scale_out >> *act >> *out;
}
void ScaleActivationFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto op_desc = GenOpDesc(matched);
auto scale_op = LiteOpRegistry::Global().Create("scale");
auto scale = matched.at("scale")->stmt()->op();
auto* scope = scale->scope();
auto& valid_places = scale->valid_places();
scale_op->Attach(op_desc, scope);
auto* new_op_node = graph->GraphCreateInstructNode(scale_op, valid_places);
IR_NODE_LINK_TO(matched.at("x"), new_op_node);
IR_NODE_LINK_TO(new_op_node, matched.at("output"));
}
cpp::OpDesc ScaleActivationFuser::GenOpDesc(const key2nodes_t& matched) {
cpp::OpDesc op_desc = *matched.at("scale")->stmt()->op_info();
op_desc.SetOutput("Out", {matched.at("output")->arg()->name});
cpp::OpDesc act_op_desc = *matched.at("act")->stmt()->op_info();
op_desc.SetAttr("activation_type", act_type_);
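  // Carry the activation's attributes over to the fused scale op: relu only
  // sets fuse_relu, relu6 passes its `threshold` as alpha, and leaky_relu
  // passes its own `alpha`.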
if (act_type_ == "relu") {
op_desc.SetAttr("fuse_relu", true);
} else if (act_type_ == "relu6") {
float alpha = act_op_desc.GetAttr<float>("threshold");
op_desc.SetAttr("alpha", alpha);
} else if (act_type_ == "leaky_relu") {
float alpha = act_op_desc.GetAttr<float>("alpha");
op_desc.SetAttr("alpha", alpha);
}
return op_desc;
}
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class ScaleActivationFuser : public FuseBase {
public:
explicit ScaleActivationFuser(const std::string& act_type) {
act_type_ = act_type;
}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
std::string act_type_;
};
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -260,6 +260,9 @@ class KernelRegistry final {
KernelRegistryForTarget<TARGET(kRKNPU),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kRKNPU),
PRECISION(kAny),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kRKNPU),
PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, //
......
......@@ -71,12 +71,17 @@ class Optimizer {
"identity_scale_eliminate_pass", //
"elementwise_mul_constant_eliminate_pass", //
"lite_sequence_pool_concat_fuse_pass", //
"lite_scale_activation_fuse_pass", //
#if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \
(defined LITE_WITH_ARM)
"lite_elementwise_add_activation_fuse_pass", //
#endif
"__xpu__resnet_fuse_pass",
"__xpu__multi_encoder_fuse_pass",
"__xpu__embedding_with_eltwise_add_fuse_pass",
"__xpu__fc_fuse_pass",
"identity_dropout_eliminate_pass", // should be placed after
// xpu fusion
"quantized_op_attributes_inference_pass", // Only for fully
// quantized model, infer
// the output scale and
......
......@@ -22,6 +22,7 @@
#include <string>
#include <vector>
#include "lite/core/program.h"
#include "lite/fluid/float16.h"
#ifdef LITE_WITH_OPENCL
#include "lite/backends/opencl/cl_image_converter.h"
......@@ -52,6 +53,24 @@ static bool write_tensorfile(const Tensor* tensor, const std::string& locate) {
return true;
}
// Append the given summary string to the file at `log_dir`.
static bool write_precision_summary_tofile(const std::string& string,
                                           const std::string& log_dir = "") {
  if (log_dir == "") {
    LOG(INFO) << "The `log_dir` of the precision summary file is not set, "
                 "so the summary will not be written to a file.";
    return false;
  }
  FILE* fp = fopen(log_dir.c_str(), "a");
  if (fp == nullptr) {
    LOG(INFO) << "Failed to open precision summary file: " << log_dir;
    return false;
  } else {
    fprintf(fp, "%s\n", string.c_str());
  }
  fclose(fp);
  return true;
}
class PrecisionProfiler {
public:
// TODO(ysh329): need to remove `explicit PrecisionProfiler`
......@@ -67,7 +86,7 @@ class PrecisionProfiler {
using std::left;
using std::fixed;
STL::stringstream ss;
ss << "========================================= "
ss << "\n\n========================================= "
<< "Detailed Precision Profiler Summary "
<< "=========================================" << std::endl;
ss << setw(45) << left << "operator:(kernel_info)"
......@@ -77,6 +96,13 @@ class PrecisionProfiler {
<< " " << setw(15) << left << "std_deviation"
<< " " << setw(15) << left << "ave_grow_rate*" << std::endl;
    // Also write the header to the file at `log_dir_`.
    if (log_dir_ != "") {
      FILE* fp = fopen(log_dir_.c_str(), "a");
      if (fp != nullptr) {
        std::string header_str{ss.str()};
        fprintf(fp, "%s\n", header_str.c_str());
        fclose(fp);
      } else {
        LOG(INFO) << "Failed to open precision summary file: " << log_dir_;
      }
    }
return ss.str();
}
......@@ -194,6 +220,7 @@ class PrecisionProfiler {
}
#ifdef LITE_WITH_OPENCL
} else if (target_type == TARGET(kOpenCL)) {
CLRuntime::Global()->command_queue().finish();
switch (layout_type) {
case DATALAYOUT(kImageDefault): {
paddle::lite::CLImageConverterDefault default_convertor;
......@@ -360,8 +387,12 @@ class PrecisionProfiler {
}
}
}
write_precision_summary_tofile(ss.str(), log_dir_);
return ss.str();
}
private:
std::string log_dir_{"/storage/emulated/0/precision.log"};
};
} // namespace profile
......
......@@ -60,6 +60,29 @@ Variable *Scope::FindLocalVar(const std::string &name) const {
return nullptr;
}
// AttributeVarNames will get persistent attribute names stored in parent scopes
std::vector<std::string> Scope::AttributeVarNames() const {
std::vector<std::string> resulted_keys;
const Scope *cur_scope = this;
while (cur_scope->parent()) {
cur_scope = cur_scope->parent();
auto keys = cur_scope->LocalVarNames();
resulted_keys.insert(resulted_keys.end(), keys.begin(), keys.end());
}
// remove feed and fetch
  std::vector<std::string> skipped_vars = {"feed", "fetch"};
  for (size_t i = 0; i < skipped_vars.size(); i++) {
    auto iter =
        std::find(resulted_keys.begin(), resulted_keys.end(), skipped_vars[i]);
    while (iter != resulted_keys.end()) {
      resulted_keys.erase(iter);
      iter = std::find(
          resulted_keys.begin(), resulted_keys.end(), skipped_vars[i]);
}
}
return resulted_keys;
}
std::vector<std::string> Scope::LocalVarNames() const {
std::vector<std::string> keys;
for (const auto &item : vars_) {
......
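The `AttributeVarNames` implementation above filters out "feed" and "fetch" with a repeated find-and-erase loop. A minimal standalone sketch of the equivalent erase-remove formulation (plain STL only; `RemoveSkippedVars` is an illustrative name, not part of this commit), assuming the same goal of dropping every occurrence of a few reserved names:

```cpp
#include <algorithm>
#include <string>
#include <vector>

// Drop every occurrence of each skipped name ("feed", "fetch") from `keys`.
void RemoveSkippedVars(std::vector<std::string>* keys) {
  const std::vector<std::string> skipped = {"feed", "fetch"};
  for (const auto& name : skipped) {
    keys->erase(std::remove(keys->begin(), keys->end(), name), keys->end());
  }
}
```

The behavior is the same as the loop above; it simply avoids rescanning the vector after every erase.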
......@@ -45,6 +45,8 @@ class Scope final {
const Scope* parent() const { return parent_; }
// Get attribute params stored in parent scopes.
std::vector<std::string> AttributeVarNames() const;
// Following the legacy scope interface.
std::vector<std::string> LocalVarNames() const;
......
......@@ -54,7 +54,7 @@ git checkout release/v2.3
--arm_lang=gcc \
--android_stl=c++_static \
--build_extra=ON \
--shutdown_log=OFF \
--with_log=ON \
full_publish
```
......
cmake_minimum_required(VERSION 2.8)
project(demo CXX C)
add_definitions(-DLITE_WITH_CUDA)
set(TARGET demo)
set(CMAKE_CXX_FLAGS "-std=c++11 -O3")
set(LITE_ROOT "${PROJECT_SOURCE_DIR}/../../cxx")
set(PROTOBUF_ROOT "${PROJECT_SOURCE_DIR}/../../third_party/protobuf")
include_directories("${LITE_ROOT}/include")
link_directories("${LITE_ROOT}/lib")
link_directories("${PROTOBUF_ROOT}/lib")
# cuda lib
link_directories("/usr/local/cuda/lib64/")
add_executable(${TARGET} ${TARGET}.cc)
set(DEPS ${LITE_ROOT}/lib/libpaddle_full_api_shared.so)
set(DEPS ${DEPS} protobuf-lite)
set(DEPS ${DEPS} "-lrt -lpthread -ldl -lcudart")
target_link_libraries(${TARGET} ${DEPS})
ARM_ABI = arm7
export ARM_ABI
include ../Makefile.def
LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party
OPENCV_VERSION=opencv4.1.0
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a
OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include -I${THIRD_PARTY_DIR}/gflags/include
CXX_LIBS = ${OPENCV_LIBS} ${THIRD_PARTY_DIR}/gflags/lib/libgflags.a $(SYSTEM_LIBS)
LITE_FULL_SHAPRED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared
LITE_FULL_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a
LITE_LIGHT_SHAPRED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared
LITE_LIGHT_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a
##########
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
(echo "fetch opencv libs" && \
wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
test_helper.o: test_helper.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_helper.o -c test_helper.cc
classification_full.o: classification_full.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_full.o -c classification_full.cc
classification_light.o: classification_light.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_light.o -c classification_light.cc
classification_full_shared: fetch_opencv classification_full.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHAPRED_LIBS}
classification_full_static: fetch_opencv classification_full.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
classification_light_shared: fetch_opencv classification_light.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHAPRED_LIBS}
classification_light_static: fetch_opencv classification_light.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
######
yolov3_full.o: yolov3_full.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_full.o -c yolov3_full.cc
yolov3_light.o: yolov3_light.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_light.o -c yolov3_light.cc
yolov3_full_shared: fetch_opencv yolov3_full.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHAPRED_LIBS}
yolov3_full_static: fetch_opencv yolov3_full.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
yolov3_light_shared: fetch_opencv yolov3_light.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHAPRED_LIBS}
yolov3_light_static: fetch_opencv yolov3_light.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
#####
all: classification_full_shared classification_full_static classification_light_shared classification_light_static yolov3_full_shared yolov3_full_static yolov3_light_shared yolov3_light_static
clean:
rm -f *.o
rm -f classification_full_shared
rm -f classification_full_static
rm -f classification_light_shared
rm -f classification_light_static
rm -f yolov3_full_shared
rm -f yolov3_full_static
rm -f yolov3_light_shared
rm -f yolov3_light_static
ARM_ABI = arm8
export ARM_ABI
include ../Makefile.def
LITE_ROOT=../../../
THIRD_PARTY_DIR=${LITE_ROOT}/third_party
OPENCV_VERSION=opencv4.1.0
OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \
../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a
OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include
CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include -I${THIRD_PARTY_DIR}/gflags/include
CXX_LIBS = ${OPENCV_LIBS} ${THIRD_PARTY_DIR}/gflags/lib/libgflags.a $(SYSTEM_LIBS)
LITE_FULL_SHAPRED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared
LITE_FULL_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a
LITE_LIGHT_SHAPRED_LIBS=-L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared
LITE_LIGHT_STATIC_LIBS=$(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a
##########
fetch_opencv:
@ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR}
@ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \
(echo "fetch opencv libs" && \
wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz)
@ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \
tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR}
test_helper.o: test_helper.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_helper.o -c test_helper.cc
classification_full.o: classification_full.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_full.o -c classification_full.cc
classification_light.o: classification_light.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o classification_light.o -c classification_light.cc
classification_full_shared: fetch_opencv classification_full.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHAPRED_LIBS}
classification_full_static: fetch_opencv classification_full.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_full.o test_helper.o -o classification_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
classification_light_shared: fetch_opencv classification_light.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHAPRED_LIBS}
classification_light_static: fetch_opencv classification_light.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) classification_light.o test_helper.o -o classification_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
######
yolov3_full.o: yolov3_full.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_full.o -c yolov3_full.cc
yolov3_light.o: yolov3_light.cc
$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_light.o -c yolov3_light.cc
yolov3_full_shared: fetch_opencv yolov3_full.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_FULL_SHAPRED_LIBS}
yolov3_full_static: fetch_opencv yolov3_full.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_full.o test_helper.o -o yolov3_full_static ${LITE_FULL_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
yolov3_light_shared: fetch_opencv yolov3_light.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_shared $(CXX_LIBS) $(LDFLAGS) ${LITE_LIGHT_SHAPRED_LIBS}
yolov3_light_static: fetch_opencv yolov3_light.o test_helper.o
$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_light.o test_helper.o -o yolov3_light_static ${LITE_LIGHT_STATIC_LIBS} $(CXX_LIBS) $(LDFLAGS)
#####
all: classification_full_shared classification_full_static classification_light_shared classification_light_static yolov3_full_shared yolov3_full_static yolov3_light_shared yolov3_light_static
clean:
rm -f *.o
rm -f classification_full_shared
rm -f classification_full_static
rm -f classification_light_shared
rm -f classification_light_static
rm -f yolov3_full_shared
rm -f yolov3_full_static
rm -f yolov3_light_shared
rm -f yolov3_light_static
**Testing the Paddle Lite C++ inference libraries**
1. Build the full_publish library with build_extra enabled, e.g. `./lite/tools/build.sh --arm_os=android --arm_abi=armv8 --arm_lang=gcc --android_stl=c++_static --build_extra=ON full_publish`.
2. Enter the build output directory, e.g. `build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/test_libs`, and run `sh prepare.sh`; all test files are collected in the `test_lite_lib_files` directory.
3. Push `test_lite_lib_files` to the phone, enter the `test_lite_lib_files` directory on the device, and run `sh run.sh`. Check the log for the test results, which cover the light library, the full library, and both the shared and static variants.
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <fstream>
#include <iostream>
#include "paddle_api.h" // NOLINT
#include "test_helper.h" // NOLINT
DEFINE_string(model_dir,
"",
"the path of the model, the model and param files is under "
"model_dir.");
DEFINE_string(model_filename,
"",
"the filename of model file. When the model is combined formate, "
"please set model_file.");
DEFINE_string(param_filename,
"",
"the filename of param file, set param_file when the model is "
"combined formate.");
DEFINE_string(img_path, "", "the path of input image");
DEFINE_string(img_txt_path,
"",
"the path of input image, the image is processed "
" and saved in txt file");
DEFINE_double(out_max_value, 0.0, "The max value in output tensor");
DEFINE_double(threshold,
1e-3,
"If the max value diff is smaller than threshold, pass test");
DEFINE_int32(out_max_value_index, 65, "The max value index in output tensor");
// Optimize model for ARM CPU.
// If the model is not in the combined format, leave model_filename and params_filename empty
void OptModel(const std::string& load_model_dir,
const std::string& model_filename,
const std::string& params_filename,
const std::string& save_model_path) {
paddle::lite_api::CxxConfig config;
config.set_model_dir(load_model_dir);
if (!model_filename.empty() && !params_filename.empty()) {
config.set_model_file(load_model_dir + "/" + model_filename);
config.set_param_file(load_model_dir + "/" + params_filename);
}
  std::vector<paddle::lite_api::Place> valid_places = {
      paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
      paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt32)},
      paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt64)},
  };
  config.set_valid_places(valid_places);
auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
std::string cmd_str = "rm -rf " + save_model_path;
int ret = system(cmd_str.c_str());
if (ret == 0) {
std::cout << "Delete old optimized model " << save_model_path << std::endl;
}
predictor->SaveOptimizedModel(save_model_path,
paddle::lite_api::LiteModelType::kNaiveBuffer);
std::cout << "Load model from " << load_model_dir << std::endl;
std::cout << "Save optimized model to " << save_model_path << std::endl;
}
void Run(const std::string& model_path,
const std::string& img_path,
const std::string& img_txt_path,
const float out_max_value,
const int out_max_value_index,
const float threshold,
const int height,
const int width) {
// set config and create predictor
paddle::lite_api::MobileConfig config;
config.set_threads(3);
config.set_model_from_file(model_path);
auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
// set input
auto input_tensor = predictor->GetInput(0);
input_tensor->Resize({1, 3, height, width});
auto input_data = input_tensor->mutable_data<float>();
if (img_txt_path.size() > 0) {
std::fstream fs(img_txt_path);
if (!fs.is_open()) {
std::cerr << "Fail to open img txt file:" << img_txt_path << std::endl;
}
int num = 1 * 3 * height * width;
for (int i = 0; i < num; i++) {
fs >> input_data[i];
}
} else {
cv::Mat img = imread(img_path, cv::IMREAD_COLOR);
if (!img.data) {
std::cerr << "Fail to open img:" << img_path << std::endl;
exit(1);
}
float means[3] = {0.485f, 0.456f, 0.406f};
float scales[3] = {0.229f, 0.224f, 0.225f};
process_img(img, width, height, input_data, means, scales);
}
predictor->Run();
auto out_tensor = predictor->GetOutput(0);
auto* out_data = out_tensor->data<float>();
int64_t output_num = ShapeProduction(out_tensor->shape());
float max_value = out_data[0];
int max_index = 0;
for (int i = 0; i < output_num; i++) {
if (max_value < out_data[i]) {
max_value = out_data[i];
max_index = i;
}
}
std::cout << "max_value:" << max_value << std::endl;
std::cout << "max_index:" << max_index << std::endl;
std::cout << "max_value_ground_truth:" << out_max_value << std::endl;
std::cout << "max_index_ground_truth:" << out_max_value_index << std::endl;
if (max_index != out_max_value_index ||
fabs(max_value - out_max_value) > threshold) {
std::cerr << "----------Fail Test.---------- \n\n";
} else {
std::cout << "----------Pass Test.---------- \n\n";
}
}
int main(int argc, char** argv) {
// Check inputs
google::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir.empty() ||
(FLAGS_img_path.empty() && FLAGS_img_txt_path.empty())) {
std::cerr << "Input error." << std::endl;
std::cerr
<< "Usage: " << argv[0] << std::endl
<< "--model_dir: the path of not optimized model \n"
"--model_filename: the model filename of not optimized model \n"
"--param_filename: the param filename of not optimized model \n"
"--img_txt_path: the path of input image, the image is processed \n"
" and saved in txt file \n"
"--img_path: the path of input image \n"
"--out_max_value: The max value in output tensor \n"
"--threshold: If the max value diff is smaller than threshold,\n"
" pass test. Default 1e-3.\n"
"--out_max_value_index: The max value index in output tensor \n";
exit(1);
}
const int height = 224;
const int width = 224;
std::string model_dir = FLAGS_model_dir;
if (model_dir.back() == '/') {
model_dir.pop_back();
}
std::string optimized_model_path = model_dir + "_opt2";
OptModel(FLAGS_model_dir,
FLAGS_model_filename,
FLAGS_param_filename,
optimized_model_path);
std::string run_model_path = optimized_model_path + ".nb";
// Run test
Run(run_model_path,
FLAGS_img_path,
FLAGS_img_txt_path,
FLAGS_out_max_value,
FLAGS_out_max_value_index,
FLAGS_threshold,
height,
width);
return 0;
}
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <fstream>
#include <iostream>
#include "paddle_api.h" // NOLINT
#include "test_helper.h" // NOLINT
DEFINE_string(optimized_model_path, "", "the path of optimized model");
DEFINE_string(img_path, "", "the path of input image");
DEFINE_string(img_txt_path,
"",
"the path of input image, the image is processed "
" and saved in txt file");
DEFINE_double(out_max_value, 0.0, "The max value in output tensor");
DEFINE_double(threshold,
1e-3,
"If the max value diff is smaller than threshold, pass test");
DEFINE_int32(out_max_value_index, -1, "The max value index in output tensor");
void Run(const std::string& model_path,
const std::string& img_path,
const std::string& img_txt_path,
const float out_max_value,
const int out_max_value_index,
const float threshold,
const int height,
const int width) {
// set config and create predictor
paddle::lite_api::MobileConfig config;
config.set_threads(3);
config.set_model_from_file(model_path);
auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
// set input
auto input_tensor = predictor->GetInput(0);
input_tensor->Resize({1, 3, height, width});
auto input_data = input_tensor->mutable_data<float>();
if (img_txt_path.size() > 0) {
std::fstream fs(img_txt_path);
if (!fs.is_open()) {
std::cerr << "Fail to open img txt file:" << img_txt_path << std::endl;
}
int num = 1 * 3 * height * width;
for (int i = 0; i < num; i++) {
fs >> input_data[i];
}
} else {
cv::Mat img = imread(img_path, cv::IMREAD_COLOR);
if (!img.data) {
std::cerr << "Fail to open img:" << img_path << std::endl;
exit(1);
}
float means[3] = {0.485f, 0.456f, 0.406f};
float scales[3] = {0.229f, 0.224f, 0.225f};
process_img(img, width, height, input_data, means, scales);
}
predictor->Run();
auto out_tensor = predictor->GetOutput(0);
auto* out_data = out_tensor->data<float>();
int64_t output_num = ShapeProduction(out_tensor->shape());
float max_value = out_data[0];
int max_index = 0;
for (int i = 0; i < output_num; i++) {
if (max_value < out_data[i]) {
max_value = out_data[i];
max_index = i;
}
}
std::cout << "max_value:" << max_value << std::endl;
std::cout << "max_index:" << max_index << std::endl;
std::cout << "max_value_ground_truth:" << out_max_value << std::endl;
std::cout << "max_index_ground_truth:" << out_max_value_index << std::endl;
if (max_index != out_max_value_index ||
fabs(max_value - out_max_value) > threshold) {
std::cerr << "----------Fail Test---------- \n\n";
} else {
std::cout << "----------Pass Test---------- \n\n";
}
}
int main(int argc, char** argv) {
// Check inputs
google::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_optimized_model_path.empty() ||
(FLAGS_img_path.empty() && FLAGS_img_txt_path.empty())) {
std::cerr << "Input error." << std::endl;
std::cerr
<< "Usage: " << argv[0] << std::endl
<< "--optimized_model_path: the path of optimized model \n"
"--img_txt_path: the path of input image, the image is processed \n"
" and saved in txt file \n"
"--img_path: the path of input image \n"
"--out_max_value: The max value in output tensor \n"
"--threshold: If the max value diff is smaller than threshold,\n"
" pass test. Default 1e-3.\n"
"--out_max_value_index: The max value index in output tensor \n";
exit(1);
}
const int height = 224;
const int width = 224;
// Run test
Run(FLAGS_optimized_model_path,
FLAGS_img_path,
FLAGS_img_txt_path,
FLAGS_out_max_value,
FLAGS_out_max_value_index,
FLAGS_threshold,
height,
width);
return 0;
}
make clean
make all -j
gf=test_lite_lib_files
if [ -d ${gf} ];then
rm -rf ${gf}
fi
mkdir ${gf}
mv classification_full_shared ${gf}
mv classification_full_static ${gf}
mv classification_light_shared ${gf}
mv classification_light_static ${gf}
mv yolov3_full_shared ${gf}
mv yolov3_full_static ${gf}
mv yolov3_light_shared ${gf}
mv yolov3_light_static ${gf}
cp run.sh ${gf}
make clean
cp -r ../../../cxx/ ${gf}
mv ${gf}/cxx ${gf}/lite
if [ ! -f "test_libs_models_imgs.tgz" ];then
wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/test_libs_models_imgs.tgz
fi
tar zxvf test_libs_models_imgs.tgz
mv test_libs_models_imgs ${gf}
mv ${gf}/test_libs_models_imgs ${gf}/models_imgs
export LD_LIBRARY_PATH=$PWD/lite/lib/:${LD_LIBRARY_PATH}
# mobilenetv1
model_name="mobilenetv1"
input_params="--img_txt_path=models_imgs/images/classification.jpg.txt \
--out_max_value=0.936887 \
--out_max_value_index=65"
echo "Test ${model_name}: light_shared, light_static, full_shared, full_static."
./classification_light_shared ${input_params} \
--optimized_model_path=models_imgs/models/mobilenetv1.nb
./classification_light_static ${input_params} \
--optimized_model_path=models_imgs/models/mobilenetv1.nb
./classification_full_shared ${input_params} \
--model_dir=models_imgs/models/mobilenetv1
./classification_full_static ${input_params} \
--model_dir=models_imgs/models/mobilenetv1
# mobilenetv2
model_name="mobilenetv2"
input_params="--img_txt_path=models_imgs/images/classification.jpg.txt \
--out_max_value=0.868888 \
--out_max_value_index=65"
echo "Test ${model_name}: light_shared, light_static, full_shared, full_static."
./classification_light_shared ${input_params} \
--optimized_model_path=models_imgs/models/mobilenetv2.nb
./classification_light_static ${input_params} \
--optimized_model_path=models_imgs/models/mobilenetv2.nb
./classification_full_shared ${input_params} \
--model_dir=models_imgs/models/mobilenetv2
./classification_full_static ${input_params} \
--model_dir=models_imgs/models/mobilenetv2
# shufflenetv2
model_name="shufflenetv2"
input_params="--img_txt_path=models_imgs/images/classification.jpg.txt \
--out_max_value=0.776729 \
--out_max_value_index=65"
echo "Test ${model_name}: light_shared, light_static, full_shared, full_static."
./classification_light_shared ${input_params} \
--optimized_model_path=models_imgs/models/shufflenetv2.nb
./classification_light_static ${input_params} \
--optimized_model_path=models_imgs/models/shufflenetv2.nb
./classification_full_shared ${input_params} \
--model_dir=models_imgs/models/shufflenetv2
./classification_full_static ${input_params} \
--model_dir=models_imgs/models/shufflenetv2
# yolov3
model_name="yolov3"
input_params="--img_txt_path=models_imgs/images/yolov3.jpg.txt \
--out_values=0,0.153605,174.494,199.729,562.075,604.014"
echo "Test ${model_name}: light_shared, light_static, full_shared, full_static."
./yolov3_light_shared ${input_params} \
--optimized_model_path=models_imgs/models/yolov3_mobilenetv1.nb
./yolov3_light_static ${input_params} \
--optimized_model_path=models_imgs/models/yolov3_mobilenetv1.nb
./yolov3_full_shared ${input_params} \
--model_dir=models_imgs/models/yolov3_mobilenetv1
./yolov3_full_static ${input_params} \
--model_dir=models_imgs/models/yolov3_mobilenetv1
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "test_helper.h" // NOLINT
#include <sys/time.h>
#include <time.h>
#include <arm_neon.h>  // float32x4_t and vld3q_f32/vst1q_f32 used below
#include <algorithm>
#include <cstdio>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <numeric>
#include <string>
#include <vector>
#include "opencv2/core.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"
double GetCurrentUS() {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
}
int64_t ShapeProduction(const std::vector<int64_t>& shape) {
int64_t num = 1;
for (auto i : shape) {
num *= i;
}
return num;
}
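// Parse a comma-separated string such as "1,3,224,224" into a vector of integers.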
std::vector<int64_t> GetIntNumsFromStr(const std::string& str) {
std::vector<int64_t> nums;
std::string tmp_str = str;
while (!tmp_str.empty()) {
int num = atoi(tmp_str.data());
nums.push_back(num);
size_t next_offset = tmp_str.find(",");
if (next_offset == std::string::npos) {
break;
} else {
tmp_str = tmp_str.substr(next_offset + 1);
}
}
return nums;
}
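// Parse a comma-separated string such as "0.5,0.25" into a vector of doubles.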
std::vector<double> GetDoubleNumsFromStr(const std::string& str) {
std::vector<double> nums;
std::string tmp_str = str;
while (!tmp_str.empty()) {
double num = atof(tmp_str.data());
nums.push_back(num);
size_t next_offset = tmp_str.find(",");
if (next_offset == std::string::npos) {
break;
} else {
tmp_str = tmp_str.substr(next_offset + 1);
}
}
return nums;
}
// Normalize with (x - mean) / scale and transpose the layout NHWC -> NCHW, accelerated with NEON
void neon_mean_scale(
const float* din, float* dout, int size, float* mean, float* scale) {
float32x4_t vmean0 = vdupq_n_f32(mean[0]);
float32x4_t vmean1 = vdupq_n_f32(mean[1]);
float32x4_t vmean2 = vdupq_n_f32(mean[2]);
float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]);
float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]);
float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]);
float* dout_c0 = dout;
float* dout_c1 = dout + size;
float* dout_c2 = dout + size * 2;
int i = 0;
for (; i < size - 3; i += 4) {
float32x4x3_t vin3 = vld3q_f32(din);
float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
vst1q_f32(dout_c0, vs0);
vst1q_f32(dout_c1, vs1);
vst1q_f32(dout_c2, vs2);
din += 12;
dout_c0 += 4;
dout_c1 += 4;
dout_c2 += 4;
}
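  // Handle the remaining (size % 4) pixels with scalar code.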
for (; i < size; i++) {
    *(dout_c0++) = (*(din++) - mean[0]) / scale[0];
    *(dout_c1++) = (*(din++) - mean[1]) / scale[1];
    *(dout_c2++) = (*(din++) - mean[2]) / scale[2];
}
}
// Process img and set it as input
void process_img(const cv::Mat& img,
int width,
int height,
float* dest_data,
float* means,
float* scales) {
cv::Mat rgb_img;
cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB);
cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f);
cv::Mat imgf;
rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f);
const float* dimg = reinterpret_cast<const float*>(imgf.data);
neon_mean_scale(dimg, dest_data, width * height, means, scales);
}
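As a cross-check for the NEON routine above, a minimal scalar reference under the same assumptions (NHWC, 3-channel float input; channel-planar NCHW output; `mean_scale_reference` is an illustrative helper, not part of the demo):

```cpp
// Scalar reference for neon_mean_scale: for each of `size` pixels,
// dout[c * size + i] = (din[i * 3 + c] - mean[c]) / scale[c].
void mean_scale_reference(const float* din,
                          float* dout,
                          int size,
                          const float* mean,
                          const float* scale) {
  for (int i = 0; i < size; ++i) {
    for (int c = 0; c < 3; ++c) {
      dout[c * size + i] = (din[i * 3 + c] - mean[c]) / scale[c];
    }
  }
}
```

Comparing its output against `neon_mean_scale` on a few random buffers is a quick way to validate the vectorized path.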
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "opencv2/core.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"
double GetCurrentUS();
int64_t ShapeProduction(const std::vector<int64_t>& shape);
std::vector<int64_t> GetIntNumsFromStr(const std::string& str);
std::vector<double> GetDoubleNumsFromStr(const std::string& str);
void neon_mean_scale(
const float* din, float* dout, int size, float* mean, float* scale);
void process_img(const cv::Mat& img,
int width,
int height,
float* dst_data,
float* means,
float* scales);
cmake_minimum_required(VERSION 2.8)
set(TARGET mobilenet_full_api)
# 1. path to Paddle-Lite lib and mklml lib
set(LITE_DIR "${PROJECT_SOURCE_DIR}/../../../cxx")
set(MKLML_DIR "${PROJECT_SOURCE_DIR}/../../../third_party/mklml/")
# 2. link mklml and Paddle-Lite directory
link_directories(${LITE_DIR}/lib ${MKLML_DIR}/lib)
include_directories(${LITE_DIR}/include/ ${MKLML_DIR}/include)
# 3. compile options
add_definitions(-std=c++11 -g -O3 -pthread)
set(EXECUTABLE_OUTPUT_PATH ${PROJECT_SOURCE_DIR})
# 4.add executable output
add_executable(${TARGET} ${TARGET}.cc)
target_link_libraries(${TARGET} -lpaddle_full_api_shared)
target_link_libraries(${TARGET} -lmklml_intel)
target_link_libraries(${TARGET} -ldl)
mkdir ./build
cd ./build
cmake ..
make
cd ..
rm -rf ./build
mkdir ./build
cd ./build
cmake ..
make
cd ..
rm -rf ./build