Commit d784eb9e authored by zhangwen31

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle-Lite into pp_yolo_support

......@@ -16,12 +16,6 @@ cmake_minimum_required(VERSION 3.0)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
include(lite_utils)
lite_option(WITH_PADDLE_MOBILE "Use the paddle-mobile legacy build" OFF)
if (WITH_PADDLE_MOBILE)
add_subdirectory(mobile)
return()
endif(WITH_PADDLE_MOBILE)
set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
set(CMAKE_CXX_STANDARD 11)
......
......@@ -43,7 +43,6 @@ Paddle Lite提供了C++、Java、Python三种API,并且提供了相应API的
- [iOS demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/ios_app_demo.html)
- [ARM Linux demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/linux_arm_demo.html)
- [X86 demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/x86.html)
- [CUDA demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/cuda.html)
- [OpenCL demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/opencl.html)
- [FPGA demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/fpga.html)
- [Huawei Kirin NPU demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/huawei_kirin_npu.html)
......@@ -77,7 +76,6 @@ Paddle Lite提供了C++、Java、Python三种API,并且提供了相应API的
| CPU(32bit) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) |
| CPU(64bit) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) |
| OpenCL | - | - | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | - |
| CUDA | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | - | - |
| FPGA | - | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | - | - |
| Huawei NPU | - | - | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | - |
| Baidu XPU | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | ![Build Status](https://img.shields.io/badge/build-passing-brightgreen.svg) | - | - |
......
......@@ -199,13 +199,10 @@ if (LITE_WITH_EXCEPTION)
add_definitions("-DLITE_WITH_EXCEPTION")
endif()
if (LITE_ON_FLATBUFFERS_DESC_VIEW)
add_definitions("-DLITE_ON_FLATBUFFERS_DESC_VIEW")
message(STATUS "Flatbuffers will be used as cpp default program description.")
endif()
if (LITE_ON_TINY_PUBLISH)
add_definitions("-DLITE_ON_TINY_PUBLISH")
add_definitions("-DLITE_ON_FLATBUFFERS_DESC_VIEW")
message(STATUS "Flatbuffers will be used as cpp default program description.")
else()
add_definitions("-DLITE_WITH_FLATBUFFERS_DESC")
endif()
......
......@@ -16,6 +16,11 @@ if(NOT LITE_WITH_HUAWEI_ASCEND_NPU)
return()
endif()
# require -D_GLIBCXX_USE_CXX11_ABI=0 if GCC 7.3.0
if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
endif()
# 1. path to Huawei Ascend Install Path
if(NOT DEFINED HUAWEI_ASCEND_NPU_DDK_ROOT)
set(HUAWEI_ASCEND_NPU_DDK_ROOT $ENV{HUAWEI_ASCEND_NPU_DDK_ROOT})
......
......@@ -27,7 +27,7 @@ SET(FLATBUFFERS_SOURCES_DIR ${CMAKE_SOURCE_DIR}/third-party/flatbuffers)
SET(FLATBUFFERS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/flatbuffers)
SET(FLATBUFFERS_INCLUDE_DIR "${FLATBUFFERS_SOURCES_DIR}/include" CACHE PATH "flatbuffers include directory." FORCE)
IF(WIN32)
set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE)
set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/flatbuffers.lib" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE)
ELSE(WIN32)
set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.a" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE)
ENDIF(WIN32)
......@@ -64,13 +64,6 @@ ExternalProject_Add(
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
IF(WIN32)
IF(NOT EXISTS "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib")
add_custom_command(TARGET extern_flatbuffers POST_BUILD
COMMAND cmake -E copy ${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/flatbuffers_static.lib ${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib
)
ENDIF()
ENDIF(WIN32)
ADD_LIBRARY(flatbuffers STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET flatbuffers PROPERTY IMPORTED_LOCATION ${FLATBUFFERS_LIBRARIES})
ADD_DEPENDENCIES(flatbuffers extern_flatbuffers)
......
......@@ -217,6 +217,10 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64")
ENDIF()
IF(LITE_WITH_HUAWEI_ASCEND_NPU)
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}")
ENDIF()
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
ExternalProject_Add(
${TARGET_NAME}
......
......@@ -267,6 +267,10 @@ function(cc_library TARGET_NAME)
list(REMOVE_ITEM cc_library_DEPS warpctc)
add_dependencies(${TARGET_NAME} warpctc)
endif()
if("${cc_library_DEPS};" MATCHES "fbs_headers;")
list(REMOVE_ITEM cc_library_DEPS fbs_headers)
add_dependencies(${TARGET_NAME} fbs_headers)
endif()
# Only deps libmklml.so, not link
if("${cc_library_DEPS};" MATCHES "mklml;")
list(REMOVE_ITEM cc_library_DEPS mklml)
......
......@@ -91,13 +91,23 @@ ImagePreprocess::ImagePreprocess(ImageFormat srcFormat, ImageFormat dstFormat, T
// Method 2
void ImagePreprocess::imageCovert(const uint8_t* src,
uint8_t* dst, ImageFormat srcFormat, ImageFormat dstFormat);
// Method 3
void ImagePreprocess::imageCovert(const uint8_t* src,
uint8_t* dst, ImageFormat srcFormat, ImageFormat dstFormat,
int srcw, int srch);
```
+ For the first `imageCovert` interface, the default arguments come from member variables of the `ImagePreprocess` class, so the following members must be assigned when the `ImagePreprocess` object is initialized:
- param srcFormat: the `srcFormat_` member of the `ImagePreprocess` class
- param dstFormat: the `dstFormat_` member of the `ImagePreprocess` class
- param srcw: the `iw` field of the `transParam_` member struct of the `ImagePreprocess` class
- param srch: the `ih` field of the `transParam_` member struct of the `ImagePreprocess` class
- For the second `imageCovert` interface, the default arguments also come from member variables of the `ImagePreprocess` class, so the following members must be assigned when the `ImagePreprocess` object is initialized:
- param srcw: the `iw` field of the `transParam_` member struct of the `ImagePreprocess` class
- param srch: the `ih` field of the `transParam_` member struct of the `ImagePreprocess` class
- The second `imageCovert` interface can be used directly
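A minimal usage sketch of the three variants (assuming the `paddle::lite::utils::cv` namespace, the header path below, and the `TransParam` fields `iw`/`ih` described above; adjust to your actual build):
```cpp
#include "lite/utils/cv/paddle_image_preprocess.h"  // assumed header location

using namespace paddle::lite::utils::cv;

void convert_demo(const uint8_t* src, uint8_t* dst) {
  TransParam tp;
  tp.iw = 640;  // source width, used as the default srcw
  tp.ih = 480;  // source height, used as the default srch
  ImagePreprocess preprocess(ImageFormat::NV12, ImageFormat::BGR, tp);

  // Variant 1: formats and sizes all come from the members set above.
  preprocess.imageCovert(src, dst);
  // Variant 2: formats passed explicitly, sizes still taken from transParam_.
  preprocess.imageCovert(src, dst, ImageFormat::NV12, ImageFormat::BGR);
  // Variant 3: everything passed explicitly, no member state required.
  preprocess.imageCovert(src, dst, ImageFormat::NV12, ImageFormat::BGR, 640, 480);
}
```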
### Resize
......
......@@ -16,69 +16,12 @@ Paddle Lite已支持百度XPU在x86和arm服务器(例如飞腾 FT-2000+/64)
### Supported Paddle models
- [ResNet50](https://paddlelite-demo.bj.bcebos.com/models/resnet50_fp32_224_fluid.tar.gz)
- [BERT](https://paddlelite-demo.bj.bcebos.com/models/bert_fp32_fluid.tar.gz)
- [ERNIE](https://paddlelite-demo.bj.bcebos.com/models/ernie_fp32_fluid.tar.gz)
- YOLOv3
- Mask R-CNN
- Faster R-CNN
- UNet
- SENet
- SSD
- [Supported open-source model list](../introduction/support_model_list)
- Baidu internal production models (details cannot be disclosed for confidentiality reasons)
### Supported (or partially supported) Paddle operators (Kernel-based integration)
- scale
- relu
- tanh
- sigmoid
- stack
- matmul
- pool2d
- slice
- lookup_table
- elementwise_add
- elementwise_sub
- cast
- batch_norm
- mul
- layer_norm
- softmax
- conv2d
- io_copy
- io_copy_once
- __xpu__fc
- __xpu__multi_encoder
- __xpu__resnet50
- __xpu__embedding_with_eltwise_add
### Supported (or partially supported) Paddle operators (subgraph/XTCL-based integration)
- relu
- tanh
- conv2d
- depthwise_conv2d
- elementwise_add
- pool2d
- softmax
- mul
- batch_norm
- stack
- gather
- scale
- lookup_table
- slice
- transpose
- transpose2
- reshape
- reshape2
- layer_norm
- gelu
- dropout
- matmul
- cast
- yolo_box
- [Operator support list](../introduction/support_operation_list)
## Reference demo
......@@ -233,7 +176,7 @@ $ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv8 --arm_lang=gcc --build
```
- Replace the PaddleLite-linux-demo/libs/PaddleLite/amd64/include directory with the generated build.lite.x86/inference_lite_lib/cxx/include;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/amd64/lib/libpaddle_full_api_shared.so file with the generated build.lite.x86/inference_lite_lib/cxx/include/lib/libpaddle_full_api_shared.so;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/amd64/lib/libpaddle_full_api_shared.so file with the generated build.lite.x86/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/include directory with the generated build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.xpu/cxx/include;
- Replace the PaddleLite-linux-demo/libs/PaddleLite/arm64/lib/libpaddle_full_api_shared.so file with the generated build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.xpu/cxx/lib/libpaddle_full_api_shared.so.
......
# Deploying PaddleLite inference with CUDA
**Note**: Lite CUDA is only an acceleration library for Nvidia GPUs and supports a limited set of models; if you need broader coverage, please use [PaddleInference](https://paddle-inference.readthedocs.io/en/latest).
Lite supports building and running with CUDA on x86_64 and arm64 architectures (e.g. TX2).
## Build
......
docs/images/architecture.png (binary image updated: 149.8 KB → 227.6 KB)
......@@ -57,7 +57,6 @@ Welcome to Paddle-Lite's documentation!
demo_guides/ios_app_demo
demo_guides/linux_arm_demo
demo_guides/x86
demo_guides/cuda
demo_guides/opencl
demo_guides/fpga
demo_guides/huawei_kirin_npu
......
......@@ -5,23 +5,25 @@ Mobile 在这次升级为 Lite 架构, 侧重多硬件、高性能的支持,
- Introduces a Type system, strengthening mixed scheduling across multiple hardware targets, quantization methods and data layouts
- Isolates hardware details; any supported hardware can be freely plugged in or removed via build switches
- Introduces MIR (Machine IR), strengthening optimization with the execution environment taken into account
- Strictly separates the optimization phase from the execution phase, keeping inference lightweight and efficient
- Cleanly decouples the graph-optimization module from the execution engine, keeping the inference-execution phase lightweight and efficient
The architecture diagram is shown below
![Paddle Inference Refactor1.0](https://user-images.githubusercontent.com/52520497/64949619-26e49580-d8ac-11e9-855a-514feb9b75af.png)
<p align="center"><img width="500" src="https://raw.githubusercontent.com/PaddlePaddle/Paddle-Lite/develop/docs/images/architecture.png"/></p>
## Strict separation of compile time and execution time
## Separation of the model-optimization phase and the inference-execution phase
- Optimization information produced at compile time can be stored into the model; at execution time the model is simply loaded and executed
- Two sets of APIs and matching inference libraries cover different scenarios
  - `CxxPredictor` bundles `Compile Time` and `Execution Time`, so analysis and optimization can run at runtime on the target hardware for the best result
  - `MobilePredictor` bundles only `Execution Time`, keeping deployment and execution lightweight
- The Analysis Phase is the model-optimization stage: it takes a Paddle inference model as input and applies Lite's acceleration and optimization strategies to the computation graph, including operator fusion, computation pruning, memory/storage optimization, quantization precision conversion and kernel selection. The optimized model is more lightweight, consumes fewer resources on the target hardware and runs faster.
- The Execution Phase is the inference-execution stage: it takes the optimized Lite model as input and performs only model loading and inference execution, enabling extremely lightweight deployment with no third-party dependencies.
## Lightweight design and implementation of `Execution Time`
Lite provides two sets of APIs and matching inference libraries for different scenarios:
- `CxxPredictor` contains both the `Analysis Phase` and the `Execution Phase`, supporting one-stop inference where model analysis/optimization and execution run together; suitable when the size of the inference library is not a concern.
- `MobilePredictor` contains only the `Execution Phase`, keeping deployment and execution lightweight and fast; it loads an optimized model from memory or from a file and runs inference.
- Each batch actually executes only two steps
  - `Op.InferShape`
## Lightweight design and implementation of the Execution Phase
- During inference execution, each batch runs only two steps
  - `OpLite.InferShape`, which infers the output dimensions from the inputs
  - `Kernel.Run`; all kernel parameters are resolved in advance via pointers, so there is no lookup or argument-passing overhead afterwards
- The design goal is that at execution time the only cost is the kernel computation itself
- Lightweight `Op` and `Kernel` design avoids extra framework overhead
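A minimal sketch of the two-phase workflow using the public C++ API (model paths and the `Place` list are illustrative, not taken from this document):
```cpp
#include "paddle_api.h"  // public Paddle Lite C++ API
using namespace paddle::lite_api;

void two_phase_demo() {
  // Analysis Phase: CxxPredictor loads the Paddle model, runs graph
  // optimizations and saves a lightweight NaiveBuffer model.
  CxxConfig cxx_config;
  cxx_config.set_model_dir("./mobilenet_v1");  // illustrative path
  cxx_config.set_valid_places({Place{TARGET(kARM), PRECISION(kFloat)}});
  auto cxx_predictor = CreatePaddlePredictor<CxxConfig>(cxx_config);
  cxx_predictor->SaveOptimizedModel("./mobilenet_v1_opt",
                                    LiteModelType::kNaiveBuffer);

  // Execution Phase: MobilePredictor only loads and runs the optimized model.
  MobileConfig mobile_config;
  mobile_config.set_model_from_file("./mobilenet_v1_opt.nb");
  auto mobile_predictor = CreatePaddlePredictor<MobileConfig>(mobile_config);
  // ... set inputs, call Run(), and read outputs as usual.
}
```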
......
......@@ -29,7 +29,8 @@ Paddle Lite支持[ARM Cortex-A系列处理器](https://en.wikipedia.org/wiki/ARM
Paddle Lite supports mobile GPUs and Nvidia embedded GPU devices; the supported list is as follows:
- ARM Mali G series
- Qualcomm Adreno series
- Nvidia Tegra series: tx1, tx2, nano, xavier
Nvidia Tegra series: tx1, tx2, nano, xavier
## NPU
Paddle Lite supports NPUs; the supported list is as follows:
......
# Supported models
The accuracy and performance of 24 models have been rigorously verified, with fairly complete coverage of vision models across classification, detection and localization, including the distinctive OCR models, and the list keeps growing.
The accuracy and performance of 28 models have been rigorously verified, with fairly complete coverage of vision models across classification, detection and localization, including the distinctive OCR models, and the list keeps growing.
| Category | Subcategory | Model | Int8 support | Supported platforms |
|-|-|:-:|:-:|-:|
| CV | Classification | mobilenetv1 | Y | ARM,X86,NPU,RKNPU,APU |
| CV | Classification | mobilenetv2 | Y | ARM,X86,NPU |
| CV | Classification | resnet18 | Y | ARM,NPU |
| CV | Classification | resnet50 | Y | ARM,X86,NPU,XPU |
| CV | Classification | mnasnet | | ARM,NPU |
| CV | Classification | efficientnet | | ARM |
| CV | Classification | squeezenetv1.1 | | ARM,NPU |
| CV | Classification | ShufflenetV2 | Y | ARM |
| CV | Classification | shufflenet | Y | ARM |
| CV | Classification | inceptionv4 | Y | ARM,X86,NPU |
| CV | Classification | vgg16 | Y | ARM |
| CV | Classification | googlenet | Y | ARM,X86 |
| CV | Detection | mobilenet_ssd | Y | ARM,NPU* |
| CV | Detection | mobilenet_yolov3 | Y | ARM,NPU* |
| CV | Detection | Faster RCNN | | ARM |
| CV | Detection | Mask RCNN | | ARM |
| CV | Segmentation | Deeplabv3 | Y | ARM |
| CV | Segmentation | unet | | ARM |
| CV | Face | facedetection | | ARM |
| CV | Face | facebox | | ARM |
| CV | Face | blazeface | Y | ARM |
| CV | Face | mtcnn | | ARM |
| CV | OCR | ocr_attention | | ARM |
| NLP | Machine translation | transformer | | ARM,NPU* |
| Category | Subcategory | Model | Supported platforms |
|-|-|:-|:-|
| CV | Classification | [MobileNetV1](https://paddlelite-demo.bj.bcebos.com/models/mobilenet_v1_fp32_224_fluid.tar.gz) | ARM,X86,NPU,RKNPU,APU |
| CV | Classification | [MobileNetV2](https://paddlelite-demo.bj.bcebos.com/models/mobilenet_v2_fp32_224_fluid.tar.gz) | ARM,X86,NPU |
| CV | Classification | [ResNet18](https://paddlelite-demo.bj.bcebos.com/models/resnet18_fp32_224_fluid.tar.gz) | ARM,NPU |
| CV | Classification | [ResNet50](https://paddlelite-demo.bj.bcebos.com/models/resnet50_fp32_224_fluid.tar.gz) | ARM,X86,NPU,XPU |
| CV | Classification | [MnasNet](https://paddlelite-demo.bj.bcebos.com/models/mnasnet_fp32_224_fluid.tar.gz) | ARM,NPU |
| CV | Classification | [EfficientNet*](https://github.com/PaddlePaddle/PaddleClas) | ARM |
| CV | Classification | [SqueezeNet](https://paddlelite-demo.bj.bcebos.com/models/squeezenet_fp32_224_fluid.tar.gz) | ARM,NPU |
| CV | Classification | [ShufflenetV2*](https://github.com/PaddlePaddle/PaddleClas) | ARM |
| CV | Classification | [ShuffleNet](https://paddlepaddle-inference-banchmark.bj.bcebos.com/shufflenet_inference.tar.gz) | ARM |
| CV | Classification | [InceptionV4](https://paddle-inference-dist.bj.bcebos.com/inception_v4_simple.tar.gz) | ARM,X86,NPU |
| CV | Classification | [VGG16](https://paddlepaddle-inference-banchmark.bj.bcebos.com/VGG16_inference.tar) | ARM |
| CV | Classification | [VGG19](https://paddlepaddle-inference-banchmark.bj.bcebos.com/VGG19_inference.tar) | XPU|
| CV | Classification | [GoogleNet](https://paddlepaddle-inference-banchmark.bj.bcebos.com/GoogleNet_inference.tar) | ARM,X86,XPU |
| CV | Detection | [MobileNet-SSD](https://paddlelite-demo.bj.bcebos.com/models/ssd_mobilenet_v1_pascalvoc_fp32_300_fluid.tar.gz) | ARM,NPU* |
| CV | Detection | [YOLOv3-MobileNetV3](https://paddlelite-demo.bj.bcebos.com/models/yolov3_mobilenet_v3_prune86_FPGM_320_fp32_fluid.tar.gz) | ARM,NPU* |
| CV | Detection | [Faster RCNN](https://paddlepaddle-inference-banchmark.bj.bcebos.com/faster_rcnn.tar) | ARM |
| CV | Detection | [Mask RCNN*](https://github.com/PaddlePaddle/PaddleDetection/blob/release/0.4/docs/MODEL_ZOO_cn.md) | ARM |
| CV | Segmentation | [Deeplabv3](https://paddlelite-demo.bj.bcebos.com/models/deeplab_mobilenet_fp32_fluid.tar.gz) | ARM |
| CV | Segmentation | [UNet](https://paddlelite-demo.bj.bcebos.com/models/Unet.zip) | ARM |
| CV | Face | [FaceDetection](https://paddlelite-demo.bj.bcebos.com/models/facedetection_fp32_240_430_fluid.tar.gz) | ARM |
| CV | Face | [FaceBoxes*](https://github.com/PaddlePaddle/PaddleDetection/blob/release/0.4/docs/featured_model/FACE_DETECTION.md#FaceBoxes) | ARM |
| CV | Face | [BlazeFace*](https://github.com/PaddlePaddle/PaddleDetection/blob/release/0.4/docs/featured_model/FACE_DETECTION.md#BlazeFace) | ARM |
| CV | Face | [MTCNN](https://paddlelite-demo.bj.bcebos.com/models/mtcnn.zip) | ARM |
| CV | OCR | [OCR-Attention](https://paddle-inference-dist.bj.bcebos.com/ocr_attention.tar.gz) | ARM |
| CV | GAN | [CycleGAN*](https://github.com/PaddlePaddle/models/tree/release/1.7/PaddleCV/gan/cycle_gan) | NPU |
| NLP | Machine translation | [Transformer*](https://github.com/PaddlePaddle/models/tree/release/1.8/PaddleNLP/machine_translation/transformer) | ARM,NPU* |
| NLP | Machine translation | [BERT](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/models_and_data_for_unittests/bert.tar.gz) | XPU |
| NLP | Semantic representation | [ERNIE](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/models_and_data_for_unittests/ernie.tar.gz) | XPU |
> **Note:** NPU* means heterogeneous ARM+NPU computation
**Note:**
1. In the model list, * means the model link comes from [PaddlePaddle/models](https://github.com/PaddlePaddle/models); otherwise the link is a direct download of the inference model
2. In the supported-platform column, NPU* means heterogeneous ARM+NPU computation; otherwise it is pure NPU computation
......@@ -76,7 +76,6 @@ pip install paddlelite
- [ARM Linux build from source](../source_compile/compile_linux)
- [x86 build from source](../demo_guides/x86)
- [OpenCL build from source](../demo_guides/opencl)
- [CUDA build from source](../demo_guides/cuda)
- [FPGA build from source](../demo_guides/fpga)
- [Huawei NPU build from source](../demo_guides/huawei_kirin_npu)
- [Baidu XPU build from source](../demo_guides/baidu_xpu)
......
......@@ -2,51 +2,63 @@
Lite is a lightweight, flexible, easily extensible and high-performance deep learning inference framework. It supports many targets such as ARM, OpenCL and NPU, and offers powerful graph optimization and inference acceleration. If you want to integrate the Lite framework into your own project, only the following simple steps are needed.
## 1. Prepare the model
The model format currently supported by the Lite framework is the one produced by the [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) deep learning framework. Before you start using Lite, you therefore need a model saved by PaddlePaddle.
If your model was produced by a framework such as Caffe2 or TensorFlow, we recommend converting it with the [X2Paddle](https://github.com/PaddlePaddle/X2Paddle) tool.
![workflow](https://raw.githubusercontent.com/PaddlePaddle/Paddle-Lite/develop/docs/images/workflow.png)
## 2. Optimize the model
**1. Prepare the model**
The Lite framework ships powerful acceleration and optimization strategies, including quantization, subgraph fusion and kernel selection. To make them easy to use, we provide the [opt](../user_guides/model_optimize_tool) tool for model optimization. The optimized model is more lightweight, consumes fewer resources and runs faster.
Paddle Lite directly supports the model format produced by the [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) deep learning framework. PaddlePaddle inference models are currently saved with the [save_inference_model](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/io_cn/save_inference_model_cn.html#save-inference-model) API.
If your model was produced by a framework such as Caffe, TensorFlow or PyTorch, you can use the [X2Paddle](https://github.com/PaddlePaddle/X2Paddle) tool to convert it to the PaddlePaddle format.
For a detailed introduction to opt, please refer to [Model optimization](../user_guides/model_optimize_tool)
**2. Optimize the model**
After downloading the opt tool, run the following:
Paddle Lite ships excellent acceleration and optimization strategies, including quantization, subgraph fusion and kernel selection. The optimized model is more lightweight, consumes fewer resources and runs faster.
These optimizations are performed with the opt tool provided by Paddle Lite. The opt tool can also collect and print the operator information of a model and report whether Paddle Lite supports it on different hardware platforms. After obtaining a model in PaddlePaddle format, you normally need to optimize it with this opt tool. For downloading and using opt, see [Model optimization](https://paddle-lite.readthedocs.io/zh/latest/user_guides/model_optimize_tool.html)
``` shell
$ ./opt \
--model_dir=<model_param_dir> \
--model_file=<model_path> \
--param_file=<param_path> \
--optimize_out_type=(protobuf|naive_buffer) \
--optimize_out=<output_optimize_model_dir> \
--valid_targets=(arm|opencl|x86)
```
**Note**: To reduce third-party dependencies and improve the portability of the Lite inference framework, the Lite API on mobile requires a model stored in the Naive Buffer format.
Here, optimize_out is the output path of the optimized model. optimize_out_type selects the serialization format of the output model; Protobuf and Naive Buffer are currently supported, and Naive Buffer is the more lightweight serialization/deserialization implementation. If you want to run inference with Lite on mobile, set optimize_out_type=naive_buffer.
**3. Download or build the library**
## 3. Run inference with the Lite framework
Paddle Lite provides official release inference libraries for the Android/iOS/X86 platforms; we recommend downloading the [prebuilt Paddle Lite libraries](https://paddle-lite.readthedocs.io/zh/latest/quick_start/release_lib.html) directly.
You can also pick the [build-from-source instructions](https://paddle-lite.readthedocs.io/zh/latest/quick_start/release_lib.html#id2) for your target platform. Paddle Lite ships build scripts under the `lite/tools/` folder; with only two steps, [preparing the environment](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html) and [invoking the build script](https://paddle-lite.readthedocs.io/zh/latest/quick_start/release_lib.html#id2), you can build the Paddle Lite inference library for the target platform.
In the previous section we obtained an optimized model through `opt`; running inference with it is also very simple. Lite offers a carefully designed API that hides the details you do not need to study. Only five simple steps are needed to run inference on mobile with Lite (illustrated with the C++ API):
**4. Develop the application**
Paddle Lite provides C++, Java and Python APIs; only five simple steps are needed to run inference (using the C++ API as an example; a minimal sketch follows the list below):
1. Declare a MobileConfig. The config can be set to **load the model from a file** or **load the model from memory**. Loading from a file requires the model file path, e.g. `config.set_model_from_file(FLAGS_model_file)`; loading from memory currently only supports the naive buffer of an optimized model, via:
`void set_model_from_buffer(model_buffer) `
1. Declare a `MobileConfig` and set the path of the model file optimized in step 2, or choose to load the model from memory
2. Create the `Predictor` by calling the `CreatePaddlePredictor` interface; one line of code initializes the engine
3. Prepare the input: obtain an input variable via `predictor->GetInput(i)`, then set its size and values
4. Run inference: a single call to `predictor->Run()` performs the prediction with the Lite framework
5. Fetch the output: obtain an output variable via `predictor->GetOutput(i)` and read its values with `data<T>`
2. Create the Predictor. The Predictor is Lite's inference engine; for convenience we provide the `CreatePaddlePredictor` interface, so one line of code initializes it: `std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor(config)`
3. Prepare the input. Calling predictor->GetInput(0) returns the 0th input field; likewise, if your model has multiple inputs, call `predictor->GetInput(i)` for the corresponding one. Once you have the input variable, use Resize to set its shape and then fill in the input values.
4. Run inference. Simply call `predictor->Run()` to perform prediction with the Lite framework.
5. Fetch the output. Similar to inputs, use `predictor->GetOutput(i)` to get the i-th output variable. Its shape() method returns the dimensions, and the `data<T>()` template method returns the output values.
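The five steps above as a minimal sketch in C++ (the model path and input shape are illustrative; link against the Paddle Lite inference library obtained in step 3):
```cpp
#include <iostream>
#include <memory>
#include "paddle_api.h"  // from the Paddle Lite release package
using namespace paddle::lite_api;

int main() {
  // 1. Declare MobileConfig and load the optimized .nb model from a file.
  MobileConfig config;
  config.set_model_from_file("./mobilenet_v1_opt.nb");  // illustrative path

  // 2. Create the Predictor; one call initializes the execution engine.
  std::shared_ptr<PaddlePredictor> predictor =
      CreatePaddlePredictor<MobileConfig>(config);

  // 3. Prepare the input: resize the first input tensor and fill its values.
  std::unique_ptr<Tensor> input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});
  float* in_data = input->mutable_data<float>();
  for (int i = 0; i < 1 * 3 * 224 * 224; ++i) in_data[i] = 1.0f;

  // 4. Run inference.
  predictor->Run();

  // 5. Fetch the output, then read its shape and values.
  std::unique_ptr<const Tensor> output = predictor->GetOutput(0);
  const float* out_data = output->data<float>();
  std::cout << "first output value: " << out_data[0]
            << ", output rank: " << output->shape().size() << std::endl;
  return 0;
}
```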
Paddle Lite provides complete usage examples and developer documentation for the C++, Java and Python APIs; you can follow them to quickly learn the usage and integrate Lite into your own project.
- [Complete C++ example](cpp_demo.html)
- [Complete Java example](java_demo.html)
- [Complete Python example](python_demo.html)
For the different hardware platforms, Paddle Lite provides a complete example per platform:
- [Android demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/android_app_demo.html)
- [iOS demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/ios_app_demo.html)
- [ARM Linux demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/linux_arm_demo.html)
- [X86 demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/x86.html)
- [OpenCL demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/opencl.html)
- [FPGA demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/fpga.html)
- [Huawei Kirin NPU demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/huawei_kirin_npu.html)
- [Baidu XPU demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/baidu_xpu.html)
- [Rockchip NPU demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/rockchip_npu.html)
- [MediaTek APU demo](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/mediatek_apu.html)
## 4. Lite API
You can also download the following inference APKs built with Paddle-Lite and install them on an Android device for a quick preview:
For your convenience we provide three APIs, C++, Java and Python, together with complete usage examples for each: [Complete C++ example](cpp_demo), [Complete Java example](java_demo), [Complete Python example](python_demo). You can follow the examples to quickly learn the C++/Java/Python API usage and integrate it into your own project. Note that, to reduce third-party dependencies and improve the portability of the Lite inference framework, using the Lite API on mobile requires a model in the Naive Buffer storage format; see section 2, `Model optimization`, for details.
- [Image classification](https://paddlelite-demo.bj.bcebos.com/apps/android/mobilenet_classification_demo.apk)
- [Object detection](https://paddlelite-demo.bj.bcebos.com/apps/android/yolo_detection_demo.apk)
- [Mask detection](https://paddlelite-demo.bj.bcebos.com/apps/android/mask_detection_demo.apk)
- [Face keypoint detection](https://paddlelite-demo.bj.bcebos.com/apps/android/face_keypoints_detection_demo.apk)
- [Portrait segmentation](https://paddlelite-demo.bj.bcebos.com/apps/android/human_segmentation_demo.apk)
## 5. Testing tools
## More testing tools
To help you understand and use the Lite framework better, we provide a [Debug tool](../user_guides/debug) and a [Profile tool](../user_guides/debug) for users with further needs. The Lite Model Debug Tool checks whether corresponding variable values differ between the Lite framework and the PaddlePaddle framework during inference, helping you quickly locate a problematic Op and reproduce and troubleshoot the issue. The Profile Monitor Tool reports how much time each Op takes, automatically collecting the number of executions and the longest, shortest and average execution times as a basic reference for performance tuning. See the [related topics](../user_guides/debug) for more.
......@@ -19,7 +19,6 @@ Paddle Lite提供了Android/iOS/X86平台的官方Release预测库下载,如
- [ARM Linux build from source](../source_compile/compile_linux)
- [X86 build from source](../demo_guides/x86)
- [OpenCL build from source](../demo_guides/opencl)
- [CUDA build from source](../demo_guides/cuda)
- [FPGA build from source](../demo_guides/fpga)
- [Huawei NPU build from source](../demo_guides/huawei_kirin_npu)
- [Baidu XPU build from source](../demo_guides/baidu_xpu)
......
......@@ -41,6 +41,7 @@ if (WITH_TESTING)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "lite_naive_model.tar.gz")
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1_int16.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz")
......@@ -51,11 +52,19 @@ if (WITH_TESTING)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "GoogleNet_inference.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "step_rnn.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "bert.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "ernie.tar.gz")
set(LITE_URL_FOR_UNITTESTS "http://paddle-inference-dist.bj.bcebos.com/PaddleLite/models_and_data_for_unittests")
# models
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "resnet50.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "bert.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "ernie.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "GoogLeNet.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "VGG19.tar.gz")
# data
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "ILSVRC2012_small.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "bert_data.tar.gz")
endif()
endif()
......
......@@ -15,7 +15,6 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
#full api dynamic library
lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
DEPS paddle_api paddle_api_light paddle_api_full)
target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files})
add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto op_registry fbs_headers)
target_link_libraries(paddle_full_api_shared framework_proto op_registry)
if(LITE_WITH_X86)
......@@ -70,6 +69,10 @@ else()
set(TARGET_COMIPILE_FLAGS "-fdata-sections")
if (NOT (ARM_TARGET_LANG STREQUAL "clang")) #gcc
set(TARGET_COMIPILE_FLAGS "${TARGET_COMIPILE_FLAGS} -flto")
# TODO (hong19860320): Disable LTO temporarily since it causes failures to catch exceptions on Android when the toolchain is gcc.
if (ARM_TARGET_OS STREQUAL "android" AND LITE_WITH_EXCEPTION)
set(TARGET_COMIPILE_FLAGS "")
endif()
endif()
set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "${TARGET_COMIPILE_FLAGS}")
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h fbs_headers)
......@@ -289,6 +292,14 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
set_target_properties(test_mobilenetv1 PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
endif()
lite_cc_test(test_mobilenetv1_int16 SRCS mobilenetv1_int16_test.cc
DEPS ${lite_model_test_DEPS} ${light_lib_DEPS}
CL_DEPS ${opencl_kernels}
NPU_DEPS ${npu_kernels} ${npu_bridges}
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl
--model_dir=${LITE_MODEL_DIR}/mobilenet_v1_int16 SERIAL)
add_dependencies(test_mobilenetv1_int16 extern_lite_download_mobilenet_v1_int16_tar_gz)
lite_cc_test(test_mobilenetv2 SRCS mobilenetv2_test.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
......
......@@ -17,7 +17,6 @@ if (NOT LITE_ON_TINY_PUBLISH)
# Unlike static library, module library has to link target to be able to work
# as a single .so lib.
target_link_libraries(paddle_lite_jni ${lib_DEPS} ${arm_kernels} ${npu_kernels})
add_dependencies(paddle_lite_jni fbs_headers)
if (LITE_WITH_NPU)
# Strips the symbols of our protobuf functions to fix the conflicts during
# loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so)
......
......@@ -30,8 +30,6 @@
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/device_info.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
......
......@@ -58,6 +58,16 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
config.mlu_input_layout(),
config.mlu_firstconv_param());
#endif // LITE_WITH_MLU
#ifdef LITE_WITH_BM
Env<TARGET(kBM)>::Init();
int device_id = 0;
if (const char *c_id = getenv("BM_VISIBLE_DEVICES")) {
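// The env var holds an ASCII digit string; convert its first character to an int (48 == '0').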
device_id = static_cast<int>(*c_id) - 48;
}
TargetWrapper<TARGET(kBM)>::SetDevice(device_id);
#endif // LITE_WITH_BM
auto use_layout_preprocess_pass =
config.model_dir().find("OPENCL_PRE_PRECESS");
VLOG(1) << "use_layout_preprocess_pass:" << use_layout_preprocess_pass;
......@@ -86,7 +96,7 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
config.subgraph_model_cache_dir());
#endif
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL)
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__)
int num_threads = config.x86_math_library_num_threads();
int real_num_threads = num_threads > 1 ? num_threads : 1;
paddle::lite::x86::MKL_Set_Num_Threads(real_num_threads);
......
......@@ -131,7 +131,8 @@ TEST(CXXApi, save_model) {
predictor.Build(FLAGS_model_dir, "", "", valid_places);
LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model;
predictor.SaveModel(FLAGS_optimized_model);
predictor.SaveModel(FLAGS_optimized_model,
lite_api::LiteModelType::kProtobuf);
predictor.SaveModel(FLAGS_optimized_model + ".naive",
lite_api::LiteModelType::kNaiveBuffer);
}
......
......@@ -46,7 +46,6 @@ void LightPredictor::Build(const std::string& model_dir,
case lite_api::LiteModelType::kProtobuf:
LoadModelPb(model_dir, "", "", scope_.get(), program_desc_.get());
break;
#endif
case lite_api::LiteModelType::kNaiveBuffer: {
if (model_from_memory) {
LoadModelNaiveFromMemory(
......@@ -56,6 +55,7 @@ void LightPredictor::Build(const std::string& model_dir,
}
break;
}
#endif
default:
LOG(FATAL) << "Unknown model type";
}
......
......@@ -17,6 +17,10 @@
#include "lite/api/paddle_api.h"
#include "lite/core/version.h"
#include "lite/model_parser/model_parser.h"
#ifndef LITE_ON_TINY_PUBLISH
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#endif
namespace paddle {
namespace lite {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/light_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
DEFINE_string(optimized_model,
"/data/local/tmp/int16_model",
"optimized_model");
DEFINE_int32(N, 1, "input_batch");
DEFINE_int32(C, 3, "input_channel");
DEFINE_int32(H, 224, "input_height");
DEFINE_int32(W, 224, "input_width");
namespace paddle {
namespace lite {
void TestModel(const std::vector<Place>& valid_places,
const std::string& model_dir) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_NO_BIND, FLAGS_threads);
LOG(INFO) << "Optimize model.";
lite::Predictor cxx_predictor;
cxx_predictor.Build(model_dir, "", "", valid_places);
cxx_predictor.SaveModel(FLAGS_optimized_model,
paddle::lite_api::LiteModelType::kNaiveBuffer);
LOG(INFO) << "Load optimized model.";
lite::LightPredictor predictor(FLAGS_optimized_model + ".nb", false);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(
std::vector<DDim::value_type>({FLAGS_N, FLAGS_C, FLAGS_H, FLAGS_W})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = FLAGS_N * FLAGS_C * FLAGS_H * FLAGS_W;
for (int i = 0; i < item_size; i++) {
data[i] = 1.;
}
LOG(INFO) << "Predictor run.";
predictor.Run();
auto* out = predictor.GetOutput(0);
const auto* pdata = out->data<float>();
std::vector<float> ref = {
0.000191383, 0.000592063, 0.000112282, 6.27426e-05, 0.000127522};
double eps = 1e-5;
for (int i = 0; i < ref.size(); ++i) {
EXPECT_NEAR(pdata[i], ref[i], eps);
}
}
TEST(MobileNetV1_Int16, test_arm) {
std::vector<Place> valid_places({
Place{TARGET(kARM), PRECISION(kFloat)},
});
std::string model_dir = FLAGS_model_dir;
TestModel(valid_places, model_dir);
}
} // namespace lite
} // namespace paddle
......@@ -25,8 +25,6 @@
#include "lite/core/profile/basic_profiler.h"
#endif // LITE_WITH_PROFILE
#include <gflags/gflags.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
using paddle::lite::profile::Timer;
......
......@@ -356,5 +356,13 @@ void MobileConfig::set_model_buffer(const char *model_buffer,
model_from_memory_ = true;
}
// This is the method for allocating workspace_size according to L3Cache size
void MobileConfig::SetArmL3CacheSize(L3CacheSetMethod method,
int absolute_val) {
#ifdef LITE_WITH_ARM
lite::DeviceInfo::Global().SetArmL3CacheSize(method, absolute_val);
#endif
}
} // namespace lite_api
} // namespace paddle
......@@ -32,6 +32,14 @@ using shape_t = std::vector<int64_t>;
using lod_t = std::vector<std::vector<uint64_t>>;
enum class LiteModelType { kProtobuf = 0, kNaiveBuffer, UNK };
// Methods for allocating L3Cache on Arm platform
enum class L3CacheSetMethod {
kDeviceL3Cache = 0, // Use the system L3 Cache size, best performance.
kDeviceL2Cache = 1, // Use the system L2 Cache size, trade off performance
// with less memory consumption.
kAbsolute = 2, // Use the external setting.
// kAutoGrow = 3, // Not supported yet, least memory consumption.
};
// return true if current device supports OpenCL model
LITE_API bool IsOpenCLBackendValid();
......@@ -294,6 +302,11 @@ class LITE_API MobileConfig : public ConfigBase {
// NOTE: This is a deprecated API and will be removed in latter release.
const std::string& param_buffer() const { return param_buffer_; }
// This is the method for allocating workspace_size according to L3Cache size
void SetArmL3CacheSize(
L3CacheSetMethod method = L3CacheSetMethod::kDeviceL3Cache,
int absolute_val = -1);
};
template <typename ConfigT>
......
......@@ -15,8 +15,6 @@
#include "lite/api/paddle_api.h"
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/io.h"
......@@ -109,7 +107,8 @@ TEST(CxxApi, share_external_data) {
TEST(LightApi, run) {
lite_api::MobileConfig config;
config.set_model_from_file(FLAGS_model_dir + ".opt2.naive.nb");
// size the workspace_ from the device L2 cache instead of the full L3 cache
config.SetArmL3CacheSize(L3CacheSetMethod::kDeviceL2Cache);
auto predictor = lite_api::CreatePaddlePredictor(config);
auto inputs = predictor->GetInputNames();
......@@ -150,6 +149,8 @@ TEST(MobileConfig, LoadfromMemory) {
// set model buffer and run model
lite_api::MobileConfig config;
config.set_model_from_buffer(model_buffer);
// allocate 1M initial space for workspace_
config.SetArmL3CacheSize(L3CacheSetMethod::kAbsolute, 1024 * 1024);
auto predictor = lite_api::CreatePaddlePredictor(config);
auto input_tensor = predictor->GetInput(0);
......
......@@ -62,6 +62,7 @@ USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(control_flow_op_unused_inputs_and_outputs_eliminate_pass)
USE_MIR_PASS(lite_scale_activation_fuse_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__resnet_d_fuse_pass);
USE_MIR_PASS(__xpu__resnet_cbam_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass);
......
......@@ -9,7 +9,7 @@ if(WIN32)
target_link_libraries(lite_pybind ${os_dependency_modules})
else()
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
target_sources(lite_pybind PUBLIC ${__lite_cc_files})
target_sources(lite_pybind PUBLIC ${__lite_cc_files} fbs_headers)
endif(WIN32)
if (LITE_ON_TINY_PUBLISH)
......
......@@ -82,16 +82,20 @@ void NeuronAdapter::InitFunctions() {
PADDLE_DLSYM(NeuronModel_setOperandValue);
PADDLE_DLSYM(NeuronModel_setOperandSymmPerChannelQuantParams);
PADDLE_DLSYM(NeuronModel_addOperation);
PADDLE_DLSYM(NeuronModel_addOperationExtension);
PADDLE_DLSYM(NeuronModel_identifyInputsAndOutputs);
PADDLE_DLSYM(NeuronCompilation_create);
PADDLE_DLSYM(NeuronCompilation_free);
PADDLE_DLSYM(NeuronCompilation_finish);
PADDLE_DLSYM(NeuronCompilation_createForDevices);
PADDLE_DLSYM(NeuronExecution_create);
PADDLE_DLSYM(NeuronExecution_free);
PADDLE_DLSYM(NeuronExecution_setInput);
PADDLE_DLSYM(NeuronExecution_setOutput);
PADDLE_DLSYM(NeuronExecution_compute);
PADDLE_DLSYM(Neuron_getDeviceCount);
PADDLE_DLSYM(Neuron_getDevice);
PADDLE_DLSYM(NeuronDevice_getName);
#undef PADDLE_DLSYM
}
......@@ -146,6 +150,25 @@ int NeuronModel_addOperation(NeuronModel* model,
model, type, inputCount, inputs, outputCount, outputs);
}
int NeuronModel_addOperationExtension(NeuronModel* model,
const char* name,
const char* vendor,
const NeuronDevice* device,
uint32_t inputCount,
const uint32_t* inputs,
uint32_t outputCount,
const uint32_t* outputs) {
return paddle::lite::NeuronAdapter::Global()
->NeuronModel_addOperationExtension()(model,
name,
vendor,
device,
inputCount,
inputs,
outputCount,
outputs);
}
int NeuronModel_identifyInputsAndOutputs(NeuronModel* model,
uint32_t inputCount,
const uint32_t* inputs,
......@@ -172,6 +195,15 @@ int NeuronCompilation_finish(NeuronCompilation* compilation) {
compilation);
}
int NeuronCompilation_createForDevices(NeuronModel* model,
const NeuronDevice* const* devices,
uint32_t numDevices,
NeuronCompilation** compilation) {
return paddle::lite::NeuronAdapter::Global()
->NeuronCompilation_createForDevices()(
model, devices, numDevices, compilation);
}
int NeuronExecution_create(NeuronCompilation* compilation,
NeuronExecution** execution) {
return paddle::lite::NeuronAdapter::Global()->NeuronExecution_create()(
......@@ -205,3 +237,18 @@ int NeuronExecution_compute(NeuronExecution* execution) {
return paddle::lite::NeuronAdapter::Global()->NeuronExecution_compute()(
execution);
}
int Neuron_getDeviceCount(uint32_t* numDevices) {
return paddle::lite::NeuronAdapter::Global()->Neuron_getDeviceCount()(
numDevices);
}
int Neuron_getDevice(uint32_t devIndex, NeuronDevice** device) {
return paddle::lite::NeuronAdapter::Global()->Neuron_getDevice()(devIndex,
device);
}
int NeuronDevice_getName(const NeuronDevice* device, const char** name) {
return paddle::lite::NeuronAdapter::Global()->NeuronDevice_getName()(device,
name);
}
......@@ -42,12 +42,25 @@ class NeuronAdapter final {
const uint32_t *,
uint32_t,
const uint32_t *);
using NeuronModel_addOperationExtension_Type = int (*)(NeuronModel *,
const char *,
const char *,
const NeuronDevice *,
uint32_t,
const uint32_t *,
uint32_t,
const uint32_t *);
using NeuronModel_identifyInputsAndOutputs_Type = int (*)(
NeuronModel *, uint32_t, const uint32_t *, uint32_t, const uint32_t *);
using NeuronCompilation_create_Type = int (*)(NeuronModel *,
NeuronCompilation **);
using NeuronCompilation_free_Type = void (*)(NeuronCompilation *);
using NeuronCompilation_finish_Type = int (*)(NeuronCompilation *);
using NeuronCompilation_createForDevices_Type =
int (*)(NeuronModel *,
const NeuronDevice *const *,
uint32_t,
NeuronCompilation **);
using NeuronExecution_create_Type = int (*)(NeuronCompilation *,
NeuronExecution **);
using NeuronExecution_free_Type = void (*)(NeuronExecution *);
......@@ -59,6 +72,10 @@ class NeuronAdapter final {
using NeuronExecution_setOutput_Type = int (*)(
NeuronExecution *, int32_t, const NeuronOperandType *, void *, size_t);
using NeuronExecution_compute_Type = int (*)(NeuronExecution *);
using Neuron_getDeviceCount_Type = int (*)(uint32_t *);
using Neuron_getDevice_Type = int (*)(uint32_t, NeuronDevice **);
using NeuronDevice_getName_Type = int (*)(const NeuronDevice *,
const char **);
Neuron_getVersion_Type Neuron_getVersion() {
CHECK(Neuron_getVersion_ != nullptr) << "Cannot load Neuron_getVersion!";
......@@ -105,6 +122,12 @@ class NeuronAdapter final {
return NeuronModel_addOperation_;
}
NeuronModel_addOperationExtension_Type NeuronModel_addOperationExtension() {
CHECK(NeuronModel_addOperationExtension_ != nullptr)
<< "Cannot load NeuronModel_addOperationExtension!";
return NeuronModel_addOperationExtension_;
}
NeuronModel_identifyInputsAndOutputs_Type
NeuronModel_identifyInputsAndOutputs() {
CHECK(NeuronModel_identifyInputsAndOutputs_ != nullptr)
......@@ -130,6 +153,12 @@ class NeuronAdapter final {
return NeuronCompilation_finish_;
}
NeuronCompilation_createForDevices_Type NeuronCompilation_createForDevices() {
CHECK(NeuronCompilation_createForDevices_ != nullptr)
<< "Cannot load NeuronCompilation_createForDevices!";
return NeuronCompilation_createForDevices_;
}
NeuronExecution_create_Type NeuronExecution_create() {
CHECK(NeuronExecution_create_ != nullptr)
<< "Cannot load NeuronExecution_create!";
......@@ -160,6 +189,23 @@ class NeuronAdapter final {
return NeuronExecution_compute_;
}
Neuron_getDeviceCount_Type Neuron_getDeviceCount() {
CHECK(Neuron_getDeviceCount_ != nullptr)
<< "Cannot load Neuron_getDeviceCount!";
return Neuron_getDeviceCount_;
}
Neuron_getDevice_Type Neuron_getDevice() {
CHECK(Neuron_getDevice_ != nullptr) << "Cannot load Neuron_getDevice!";
return Neuron_getDevice_;
}
NeuronDevice_getName_Type NeuronDevice_getName() {
CHECK(NeuronDevice_getName_ != nullptr)
<< "Cannot load NeuronDevice_getName!";
return NeuronDevice_getName_;
}
private:
NeuronAdapter();
NeuronAdapter(const NeuronAdapter &) = delete;
......@@ -176,16 +222,23 @@ class NeuronAdapter final {
NeuronModel_setOperandSymmPerChannelQuantParams_Type
NeuronModel_setOperandSymmPerChannelQuantParams_{nullptr};
NeuronModel_addOperation_Type NeuronModel_addOperation_{nullptr};
NeuronModel_addOperationExtension_Type NeuronModel_addOperationExtension_{
nullptr};
NeuronModel_identifyInputsAndOutputs_Type
NeuronModel_identifyInputsAndOutputs_{nullptr};
NeuronCompilation_create_Type NeuronCompilation_create_{nullptr};
NeuronCompilation_free_Type NeuronCompilation_free_{nullptr};
NeuronCompilation_finish_Type NeuronCompilation_finish_{nullptr};
NeuronCompilation_createForDevices_Type NeuronCompilation_createForDevices_{
nullptr};
NeuronExecution_create_Type NeuronExecution_create_{nullptr};
NeuronExecution_free_Type NeuronExecution_free_{nullptr};
NeuronExecution_setInput_Type NeuronExecution_setInput_{nullptr};
NeuronExecution_setOutput_Type NeuronExecution_setOutput_{nullptr};
NeuronExecution_compute_Type NeuronExecution_compute_{nullptr};
Neuron_getDeviceCount_Type Neuron_getDeviceCount_{nullptr};
Neuron_getDevice_Type Neuron_getDevice_{nullptr};
NeuronDevice_getName_Type NeuronDevice_getName_{nullptr};
};
} // namespace lite
} // namespace paddle
......@@ -127,8 +127,10 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
anchor_generator.cc
split_merge_lod_tenosr.cc
reduce_prod.cc
reduce_sum.cc
lstm.cc
clip.cc
pixel_shuffle.cc
scatter.cc
DEPS ${lite_kernel_deps} context tensor)
endif()
......@@ -620,8 +620,10 @@ void conv_depthwise_3x3_fp32(const void* din,
int pad = pad_w;
bool flag_bias = param.bias != nullptr;
bool pads_less = ((paddings[1] < 2) && (paddings[3] < 2));
bool ch_four = ch_in <= 4 * w_in;
if (stride == 1) {
if (pads_less && (pad_h == pad_w) && (pad < 2)) { // support pad = [0, 1]
if (ch_four && pads_less && (pad_h == pad_w) &&
(pad < 2)) { // support pad = [0, 1]
conv_depthwise_3x3s1_fp32(reinterpret_cast<const float*>(din),
reinterpret_cast<float*>(dout),
num,
......@@ -638,7 +640,6 @@ void conv_depthwise_3x3_fp32(const void* din,
act_param,
ctx);
} else {
#ifdef __aarch64__
conv_3x3s1_depthwise_fp32(reinterpret_cast<const float*>(din),
reinterpret_cast<float*>(dout),
num,
......@@ -653,30 +654,10 @@ void conv_depthwise_3x3_fp32(const void* din,
param,
act_param,
ctx);
#else
#ifdef LITE_WITH_ARM_CLANG
LOG(FATAL) << "fp32 depthwise conv3x3s1px doesnot support in v7-clang, "
"this can run in basic";
#else
conv_3x3s1_depthwise_fp32(reinterpret_cast<const float*>(din),
reinterpret_cast<float*>(dout),
num,
ch_out,
h_out,
w_out,
ch_in,
h_in,
w_in,
reinterpret_cast<const float*>(weights),
bias,
param,
act_param,
ctx);
#endif
#endif
}
} else if (stride == 2) {
if (pads_less && pad_h == pad_w && (pad < 2)) { // support pad = [0, 1]
if (ch_four && pads_less && pad_h == pad_w &&
(pad < 2)) { // support pad = [0, 1]
conv_depthwise_3x3s2_fp32(reinterpret_cast<const float*>(din),
reinterpret_cast<float*>(dout),
num,
......
......@@ -53,7 +53,9 @@
#include "lite/backends/arm/math/reduce_max.h"
#include "lite/backends/arm/math/reduce_mean.h"
#include "lite/backends/arm/math/reduce_prod.h"
#include "lite/backends/arm/math/reduce_sum.h"
#include "lite/backends/arm/math/scale.h"
#include "lite/backends/arm/math/scatter.h"
#include "lite/backends/arm/math/sequence_expand.h"
#include "lite/backends/arm/math/sequence_pool.h"
#include "lite/backends/arm/math/sequence_pool_grad.h"
......@@ -357,6 +359,15 @@ inline float32x4_t pow_ps(float32x4_t a, float32x4_t b) {
return exp_ps(vmulq_f32(b, log_ps(a)));
}
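// Pairwise horizontal add: returns {a0+a1, a2+a3, b0+b1, b2+b3} (same behavior as the AArch64 vpaddq_f32 intrinsic).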
inline float32x4_t vpaddq_f32(float32x4_t a, float32x4_t b) {
float32x4_t vrst;
vrst[0] = a[0] + a[1];
vrst[1] = a[2] + a[3];
vrst[2] = b[0] + b[1];
vrst[3] = b[2] + b[3];
return vrst;
}
template <typename T>
void fill_bias_fc(
T* tensor, const T* bias, int num, int channel, bool flag_relu);
......
......@@ -70,7 +70,8 @@ void bilinear_interp(const float* src,
int h_out,
float scale_x,
float scale_y,
bool with_align) {
bool align_corners,
bool align_mode) {
int* buf = new int[w_out + h_out + w_out * 2 + h_out * 2];
int* xofs = buf;
......@@ -78,14 +79,13 @@ void bilinear_interp(const float* src,
float* alpha = reinterpret_cast<float*>(buf + w_out + h_out);
float* beta = reinterpret_cast<float*>(buf + w_out + h_out + w_out * 2);
bool with_align = (align_mode == 0 && !align_corners);
float fx = 0.0f;
float fy = 0.0f;
int sx = 0;
int sy = 0;
if (with_align) {
scale_x = static_cast<float>(w_in - 1) / (w_out - 1);
scale_y = static_cast<float>(h_in - 1) / (h_out - 1);
if (!with_align) {
// calculate x axis coordinate
for (int dx = 0; dx < w_out; dx++) {
fx = dx * scale_x;
......@@ -105,8 +105,6 @@ void bilinear_interp(const float* src,
beta[dy * 2 + 1] = fy;
}
} else {
scale_x = static_cast<float>(w_in) / w_out;
scale_y = static_cast<float>(h_in) / h_out;
// calculate x axis coordinate
for (int dx = 0; dx < w_out; dx++) {
fx = scale_x * (dx + 0.5f) - 0.5f;
......@@ -468,15 +466,9 @@ void nearest_interp(const float* src,
float* dst,
int w_out,
int h_out,
float scale_x,
float scale_y,
float scale_w_new,
float scale_h_new,
bool with_align) {
float scale_w_new = (with_align)
? (static_cast<float>(w_in - 1) / (w_out - 1))
: (static_cast<float>(w_in) / (w_out));
float scale_h_new = (with_align)
? (static_cast<float>(h_in - 1) / (h_out - 1))
: (static_cast<float>(h_in) / (h_out));
if (with_align) {
for (int h = 0; h < h_out; ++h) {
float* dst_p = dst + h * w_out;
......@@ -506,7 +498,8 @@ void interpolate(lite::Tensor* X,
int out_height,
int out_width,
float scale,
bool with_align,
bool align_corners,
bool align_mode,
std::string interpolate_type) {
int in_h = X->dims()[2];
int in_w = X->dims()[3];
......@@ -531,12 +524,12 @@ void interpolate(lite::Tensor* X,
out_width = out_size_data[1];
}
}
float height_scale = scale;
float width_scale = scale;
if (out_width > 0 && out_height > 0) {
height_scale = static_cast<float>(out_height / X->dims()[2]);
width_scale = static_cast<float>(out_width / X->dims()[3]);
}
// float height_scale = scale;
// float width_scale = scale;
// if (out_width > 0 && out_height > 0) {
// height_scale = static_cast<float>(out_height / X->dims()[2]);
// width_scale = static_cast<float>(out_width / X->dims()[3]);
// }
int num_cout = X->dims()[0];
int c_cout = X->dims()[1];
Out->Resize({num_cout, c_cout, out_height, out_width});
......@@ -551,6 +544,10 @@ void interpolate(lite::Tensor* X,
int spatial_in = in_h * in_w;
int spatial_out = out_h * out_w;
float scale_x = (align_corners) ? (static_cast<float>(in_w - 1) / (out_w - 1))
: (static_cast<float>(in_w) / (out_w));
float scale_y = (align_corners) ? (static_cast<float>(in_h - 1) / (out_h - 1))
: (static_cast<float>(in_h) / (out_h));
if ("Bilinear" == interpolate_type) {
#pragma omp parallel for
for (int i = 0; i < count; ++i) {
......@@ -560,9 +557,10 @@ void interpolate(lite::Tensor* X,
dout + spatial_out * i,
out_w,
out_h,
1.f / width_scale,
1.f / height_scale,
with_align);
scale_x,
scale_y,
align_corners,
align_mode);
}
} else if ("Nearest" == interpolate_type) {
#pragma omp parallel for
......@@ -573,9 +571,9 @@ void interpolate(lite::Tensor* X,
dout + spatial_out * i,
out_w,
out_h,
1.f / width_scale,
1.f / height_scale,
with_align);
scale_x,
scale_y,
align_corners);
}
}
}
......
......@@ -30,7 +30,8 @@ void bilinear_interp(const float* src,
int h_out,
float scale_x,
float scale_y,
bool with_align);
bool align_corners,
bool align_mode);
void nearest_interp(const float* src,
int w_in,
......@@ -40,7 +41,7 @@ void nearest_interp(const float* src,
int h_out,
float scale_x,
float scale_y,
bool with_align);
bool align_corners);
void interpolate(lite::Tensor* X,
lite::Tensor* OutSize,
......@@ -50,7 +51,8 @@ void interpolate(lite::Tensor* X,
int out_height,
int out_width,
float scale,
bool with_align,
bool align_corners,
bool align_mode,
std::string interpolate_type);
} /* namespace math */
......
File mode changed from 100644 to 100755
......@@ -2224,7 +2224,13 @@ void pooling3x3s2p1_max(const float* din,
w_unroll_size -= 1;
w_unroll_remian = wout - w_unroll_size * 4;
}
float32x4_t vmin = vdupq_n_f32(std::numeric_limits<float>::lowest());
int w_needed = wout * 2 + 1;
int need_right = w_needed - win - pad_right;
int w_2 = need_right > 0 ? w_unroll_remian : w_unroll_remian + 1;
w_2 = w_unroll_size <= 0 ? w_2 - 1 : w_2;
need_right = wout > 1 ? need_right : 0;
float minval = std::numeric_limits<float>::lowest();
float32x4_t vmin = vdupq_n_f32(minval);
for (int n = 0; n < num; ++n) {
float* data_out_batch = data_out + n * chout * size_channel_out;
......@@ -2263,6 +2269,11 @@ void pooling3x3s2p1_max(const float* din,
break;
}
}
auto pr0 = dr0;
auto pr1 = dr1;
auto pr2 = dr2;
int cnt_num = w_unroll_size;
if (w_unroll_size > 0) {
#ifdef __aarch64__
......@@ -2316,27 +2327,60 @@ void pooling3x3s2p1_max(const float* din,
"q11",
"q15");
#endif
dr0 -= 8;
dr1 -= 8;
dr2 -= 8;
}
// deal with right pad
int wstart = w_unroll_size * 4 * S - P;
for (int j = 0; j < w_unroll_remian; ++j) {
int wend = std::min(wstart + K, win);
int st = wstart > 0 ? wstart : 0;
float tmp = dr0[0];
for (int i = 0; i < wend - st; i++) {
} else {
float tmp = minval;
int left_ = std::min(2, win);
for (int i = 0; i < left_; i++) {
tmp = std::max(tmp, dr0[i]);
tmp = std::max(tmp, dr1[i]);
tmp = std::max(tmp, dr2[i]);
}
*(dr_out++) = tmp;
dr0 += S - (st - wstart);
dr1 += S - (st - wstart);
dr2 += S - (st - wstart);
wstart += S;
dr_out[0] = tmp;
dr0++;
dr1++;
dr2++;
dr_out++;
}
for (int w = 0; w < w_2 - 1; w += 1) {
float32x4_t vr0 = vld1q_f32(dr0);
float32x4_t vr1 = vld1q_f32(dr1);
float32x4_t vr2 = vld1q_f32(dr2);
vr0 = vsetq_lane_f32(minval, vr0, 3);
vr1 = vsetq_lane_f32(minval, vr1, 3);
vr2 = vsetq_lane_f32(minval, vr2, 3);
float32x4_t vmax1 = vmaxq_f32(vr0, vr1);
vmax1 = vmaxq_f32(vmax1, vr2);
float32x2_t vmax2 =
vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1));
float32x2_t vmax = vpmax_f32(vmax2, vmax2);
dr_out[0] = vget_lane_f32(vmax, 0);
dr_out++;
dr0 += 2;
dr1 += 2;
dr2 += 2;
}
if (need_right) {
float tmp = minval;
int idx = win - 1;
tmp = std::max(tmp, std::max(pr0[idx], pr1[idx]));
tmp = std::max(tmp, pr2[idx]);
dr_out[0] = tmp;
if (win % 2) {
idx = win - 2;
tmp = std::max(tmp, std::max(pr0[idx], pr1[idx]));
tmp = std::max(tmp, pr2[idx]);
dr_out[0] = tmp;
}
}
data_out_channel += wout;
}
}
......@@ -2573,6 +2617,7 @@ void pooling3x3s2p0_max(const float* din,
int wend = std::min(tmp_val + K, win) - tmp_val;
float minval = std::numeric_limits<float>::lowest();
remain = right > 0 ? remain : remain + 1;
for (int n = 0; n < num; ++n) {
float* data_out_batch = data_out + n * chout * size_channel_out;
const float* data_in_batch = data_in + n * chin * size_channel_in;
......@@ -2663,13 +2708,14 @@ void pooling3x3s2p0_max(const float* din,
vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1));
float32x2_t vmax = vpmax_f32(vmax2, vmax2);
dr_out[0] = vget_lane_f32(vmax, 0);
dr_out++;
dr0 += 2;
dr1 += 2;
dr2 += 2;
}
if (right) {
float tmp = dr0[0]; // std::numeric_limits<float>::min();
if (right > 0) {
float tmp = dr0[0];
for (int i = 0; i < wend; i++) {
tmp = std::max(tmp, std::max(dr0[i], dr1[i]));
tmp = std::max(tmp, dr2[i]);
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/arm/math/reduce_sum.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
template <>
void reduce_sum_n<float>(const float* src,
float* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
int chw_size = channel_in * height_in * width_in;
if (num_in == 1) {
memcpy(dst, src, sizeof(float) * chw_size);
} else {
int cnt_n = num_in >> 2;
int remain_n = num_in & 3;
int cnt_chw = chw_size >> 3;
int cnt_rem = chw_size & 7;
int stride = chw_size << 2;
int stride_c = 0;
for (int c = 0; c < cnt_chw; c++) {
float32x4_t vsum0 = vdupq_n_f32(0.f);
float32x4_t vsum1 = vdupq_n_f32(0.f);
const float* din_ptr0 = src + stride_c;
const float* din_ptr1 = din_ptr0 + chw_size;
const float* din_ptr2 = din_ptr1 + chw_size;
const float* din_ptr3 = din_ptr2 + chw_size;
for (int n = 0; n < cnt_n; n++) {
float32x4_t va0 = vld1q_f32(din_ptr0);
float32x4_t vb0 = vld1q_f32(din_ptr1);
float32x4_t va1 = vld1q_f32(din_ptr0 + 4);
float32x4_t vb1 = vld1q_f32(din_ptr1 + 4);
float32x4_t vc0 = vld1q_f32(din_ptr2);
float32x4_t vd0 = vld1q_f32(din_ptr3);
float32x4_t vs00 = vaddq_f32(va0, vb0);
float32x4_t vc1 = vld1q_f32(din_ptr2 + 4);
float32x4_t vs10 = vaddq_f32(va1, vb1);
float32x4_t vd1 = vld1q_f32(din_ptr3 + 4);
float32x4_t vs01 = vaddq_f32(vc0, vd0);
vsum0 = vaddq_f32(vsum0, vs00);
float32x4_t vs11 = vaddq_f32(vc1, vd1);
vsum1 = vaddq_f32(vsum1, vs10);
din_ptr0 += stride;
din_ptr1 += stride;
vsum0 = vaddq_f32(vsum0, vs01);
din_ptr2 += stride;
din_ptr3 += stride;
vsum1 = vaddq_f32(vsum1, vs11);
}
for (int n = 0; n < remain_n; n++) {
float32x4_t va0 = vld1q_f32(din_ptr0);
float32x4_t va1 = vld1q_f32(din_ptr0 + 4);
vsum0 = vaddq_f32(vsum0, va0);
din_ptr0 += chw_size;
vsum1 = vaddq_f32(vsum1, va1);
}
vst1q_f32(dst, vsum0);
dst += 4;
stride_c += 8;
vst1q_f32(dst, vsum1);
dst += 4;
}
if (cnt_rem > 3) {
float32x4_t vsum0 = vdupq_n_f32(0.f);
const float* din_ptr0 = src + stride_c;
const float* din_ptr1 = din_ptr0 + chw_size;
const float* din_ptr2 = din_ptr1 + chw_size;
const float* din_ptr3 = din_ptr2 + chw_size;
for (int n = 0; n < cnt_n; n++) {
float32x4_t va0 = vld1q_f32(din_ptr0);
float32x4_t vb0 = vld1q_f32(din_ptr1);
float32x4_t vc0 = vld1q_f32(din_ptr2);
float32x4_t vd0 = vld1q_f32(din_ptr3);
float32x4_t vs00 = vaddq_f32(va0, vb0);
float32x4_t vs01 = vaddq_f32(vc0, vd0);
vsum0 = vaddq_f32(vsum0, vs00);
din_ptr0 += stride;
din_ptr1 += stride;
vsum0 = vaddq_f32(vsum0, vs01);
din_ptr2 += stride;
din_ptr3 += stride;
}
for (int n = 0; n < remain_n; n++) {
float32x4_t va0 = vld1q_f32(din_ptr0);
din_ptr0 += chw_size;
vsum0 = vaddq_f32(vsum0, va0);
}
stride_c += 4;
vst1q_f32(dst, vsum0);
dst += 4;
cnt_rem -= 4;
}
for (int c = 0; c < cnt_rem; c++) {
const float* din_ptr0 = src + stride_c;
const float* din_ptr1 = din_ptr0 + chw_size;
const float* din_ptr2 = din_ptr1 + chw_size;
const float* din_ptr3 = din_ptr2 + chw_size;
float sum = 0.0;
for (int n = 0; n < cnt_n; n++) {
float tmp0 = din_ptr0[0] + din_ptr1[0];
float tmp1 = din_ptr2[0] + din_ptr3[0];
din_ptr0 += stride;
din_ptr1 += stride;
sum += tmp0;
din_ptr2 += stride;
din_ptr3 += stride;
sum += tmp1;
}
for (int n = 0; n < remain_n; n++) {
sum += din_ptr0[0];
din_ptr0 += chw_size;
}
stride_c++;
dst[0] = sum;
dst++;
}
}
}
template <>
void reduce_sum_c<float>(const float* src,
float* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
int hw_size = height_in * width_in;
int chw_size = hw_size * channel_in;
for (int n = 0; n < num_in; ++n) {
reduce_sum_n<float>(src, dst, channel_in, 1, height_in, width_in);
src += chw_size;
dst += hw_size;
}
}
template <>
void reduce_sum_h<float>(const float* src,
float* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
int nc_size = num_in * channel_in;
int hw_size = height_in * width_in;
for (int n = 0; n < nc_size; ++n) {
reduce_sum_n<float>(src, dst, height_in, 1, 1, width_in);
src += hw_size;
dst += width_in;
}
}
template <>
void reduce_sum_w<float>(const float* src,
float* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
int nch_size = num_in * channel_in * height_in;
int cnt_w = width_in >> 3;
int cnt_n = nch_size >> 2;
int rem_w = width_in & 7;
int rem_n = nch_size & 3;
int stride = 0;
int stride_n = width_in << 2;
for (int n = 0; n < cnt_n; n++) {
const float* din_ptr0 = src + stride;
const float* din_ptr1 = din_ptr0 + width_in;
const float* din_ptr2 = din_ptr1 + width_in;
const float* din_ptr3 = din_ptr2 + width_in;
float32x4_t vsum = vdupq_n_f32(0.f);
int tmp = rem_w;
for (int w = 0; w < cnt_w; w++) {
float32x4_t va0 = vld1q_f32(din_ptr0);
float32x4_t va1 = vld1q_f32(din_ptr0 + 4);
float32x4_t vb0 = vld1q_f32(din_ptr1);
float32x4_t vb1 = vld1q_f32(din_ptr1 + 4);
float32x4_t vc0 = vld1q_f32(din_ptr2);
float32x4_t vc1 = vld1q_f32(din_ptr2 + 4);
float32x4_t vs0 = vaddq_f32(va0, va1);
float32x4_t vd0 = vld1q_f32(din_ptr3);
float32x4_t vs1 = vaddq_f32(vb0, vb1);
float32x4_t vd1 = vld1q_f32(din_ptr3 + 4);
float32x4_t vs2 = vaddq_f32(vc0, vc1);
din_ptr0 += 8;
float32x4_t vs3 = vaddq_f32(vd0, vd1);
din_ptr1 += 8;
float32x4_t vs00 = vpaddq_f32(vs0, vs1);
din_ptr2 += 8;
float32x4_t vs01 = vpaddq_f32(vs2, vs3);
din_ptr3 += 8;
float32x4_t vs = vpaddq_f32(vs00, vs01);
vsum = vaddq_f32(vs, vsum);
}
if (tmp > 3) {
float32x4_t va0 = vld1q_f32(din_ptr0);
float32x4_t vb0 = vld1q_f32(din_ptr1);
float32x4_t vc0 = vld1q_f32(din_ptr2);
float32x4_t vd0 = vld1q_f32(din_ptr3);
din_ptr0 += 4;
din_ptr1 += 4;
float32x4_t vs00 = vpaddq_f32(va0, vb0);
float32x4_t vs01 = vpaddq_f32(vc0, vd0);
din_ptr2 += 4;
din_ptr3 += 4;
float32x4_t vs = vpaddq_f32(vs00, vs01);
vsum = vaddq_f32(vs, vsum);
tmp -= 4;
}
for (int w = 0; w < tmp; w++) {
vsum[0] += *din_ptr0++;
vsum[1] += *din_ptr1++;
vsum[2] += *din_ptr2++;
vsum[3] += *din_ptr3++;
}
stride += stride_n;
vst1q_f32(dst, vsum);
dst += 4;
}
if (rem_n > 1) {
const float* din_ptr0 = src + stride;
const float* din_ptr1 = din_ptr0 + width_in;
float32x4_t vsum = vdupq_n_f32(0.f);
for (int w = 0; w < cnt_w; w++) {
float32x4_t va0 = vld1q_f32(din_ptr0);
din_ptr0 += 4;
float32x4_t vb0 = vld1q_f32(din_ptr1);
din_ptr1 += 4;
float32x4_t va1 = vld1q_f32(din_ptr0);
float32x4_t vb1 = vld1q_f32(din_ptr1);
float32x4_t vs0 = vpaddq_f32(va0, vb0);
din_ptr0 += 4;
float32x4_t vs1 = vpaddq_f32(va1, vb1);
din_ptr1 += 4;
float32x4_t vs00 = vpaddq_f32(vs0, vs1);
vsum = vaddq_f32(vs00, vsum);
}
int tmp = rem_w;
if (tmp > 3) {
float32x4_t va0 = vld1q_f32(din_ptr0);
float32x4_t vb0 = vld1q_f32(din_ptr1);
din_ptr0 += 4;
din_ptr1 += 4;
float32x4_t vs00 = vpaddq_f32(va0, vb0);
tmp -= 4;
vsum[0] += vs00[0];
vsum[2] += vs00[1];
vsum[1] += vs00[2];
vsum[3] += vs00[3];
}
vsum[0] += vsum[2];
vsum[1] += vsum[3];
for (int w = 0; w < tmp; w++) {
vsum[0] += *din_ptr0++;
vsum[1] += *din_ptr1++;
}
stride += width_in;
*dst++ = vsum[0];
stride += width_in;
*dst++ = vsum[1];
rem_n -= 2;
}
for (int n = 0; n < rem_n; n++) {
const float* din_ptr0 = src + stride;
float32x4_t vsum = vdupq_n_f32(0.f);
for (int w = 0; w < cnt_w; w++) {
float32x4_t va0 = vld1q_f32(din_ptr0);
float32x4_t va1 = vld1q_f32(din_ptr0 + 4);
float32x4_t vs0 = vaddq_f32(va0, va1);
din_ptr0 += 8;
vsum = vaddq_f32(vs0, vsum);
}
if (rem_w > 3) {
float32x4_t va0 = vld1q_f32(din_ptr0);
din_ptr0 += 4;
vsum = vaddq_f32(vsum, va0);
rem_w -= 4;
}
vsum[1] += vsum[2];
for (int w = 0; w < rem_w; w++) {
vsum[0] += *din_ptr0++;
}
vsum[1] += vsum[3];
vsum[0] += vsum[1];
*dst++ = vsum[0];
}
}
template <>
void reduce_sum_all<float>(const float* src, float* dst, int all_size) {
int cnt_n = all_size >> 4;
int rem_n = all_size & 15;
int cnt_rem = rem_n >> 2;
int rem_rem = rem_n & 3;
float32x4_t vsum = vdupq_n_f32(0.f);
for (int n = 0; n < cnt_n; n++) {
float32x4_t va0 = vld1q_f32(src);
float32x4_t va1 = vld1q_f32(src + 4);
float32x4_t va2 = vld1q_f32(src + 8);
float32x4_t va3 = vld1q_f32(src + 12);
src += 16;
float32x4_t vs0 = vaddq_f32(va0, va1);
float32x4_t vs1 = vaddq_f32(va2, va3);
float32x4_t vs = vpaddq_f32(vs0, vs1);
vsum = vaddq_f32(vsum, vs);
}
for (int n = 0; n < cnt_rem; n++) {
float32x4_t va0 = vld1q_f32(src);
src += 4;
vsum = vaddq_f32(vsum, va0);
}
vsum[1] += vsum[2];
for (int n = 0; n < rem_rem; n++) {
vsum[0] += *src++;
}
vsum[1] += vsum[3];
vsum[0] += vsum[1];
dst[0] = vsum[0];
}
template <>
void reduce_sum_nc<float>(const float* src,
float* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
// reduce nc.
int num = num_in * channel_in;
int size = height_in * width_in;
reduce_sum_n(src, dst, num, size, 1, 1);
}
template <>
void reduce_sum_ch<float>(const float* src,
float* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
int ch_size = channel_in * height_in;
int chw_size = ch_size * width_in;
for (int n = 0; n < num_in; n++) {
reduce_sum_n<float>(src, dst, ch_size, 1, 1, width_in);
src += chw_size;
dst += width_in;
}
}
template <>
void reduce_sum_hw<float>(const float* src,
float* dst,
int num_in,
int channel_in,
int height_in,
int width_in) {
int hw_size = height_in * width_in;
int nc_size = num_in * channel_in;
reduce_sum_w(src, dst, nc_size, 1, 1, hw_size);
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
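// For reference, the float specializations above compute the following sums
// (a sketch in NCHW index notation; the subscripts are only illustrative):
//   reduce_sum_n:   dst_{c,h,w} = \sum_{n} src_{n,c,h,w}
//   reduce_sum_c:   dst_{n,h,w} = \sum_{c} src_{n,c,h,w}
//   reduce_sum_h:   dst_{n,c,w} = \sum_{h} src_{n,c,h,w}
//   reduce_sum_w:   dst_{n,c,h} = \sum_{w} src_{n,c,h,w}
//   reduce_sum_nc:  dst_{h,w}   = \sum_{n,c} src_{n,c,h,w}
//   reduce_sum_ch:  dst_{n,w}   = \sum_{c,h} src_{n,c,h,w}
//   reduce_sum_hw:  dst_{n,c}   = \sum_{h,w} src_{n,c,h,w}
//   reduce_sum_all: dst_{0}     = \sum_{n,c,h,w} src_{n,c,h,w}
// The composite variants are implemented by reinterpreting the tensor shape
// and reusing reduce_sum_n / reduce_sum_w, as the calls above show.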
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle {
namespace lite {
namespace arm {
namespace math {
template <typename T>
void reduce_sum_n(const T* src,
T* dst,
int num_in,
int channel_in,
int height_in,
int width_in);
template <typename T>
void reduce_sum_c(const T* src,
T* dst,
int num_in,
int channel_in,
int height_in,
int width_in);
template <typename T>
void reduce_sum_h(const T* src,
T* dst,
int num_in,
int channel_in,
int height_in,
int width_in);
template <typename T>
void reduce_sum_w(const T* src,
T* dst,
int num_in,
int channel_in,
int height_in,
int width_in);
template <typename T>
void reduce_sum_nc(const T* src,
T* dst,
int num_in,
int channel_in,
int height_in,
int width_in);
template <typename T>
void reduce_sum_ch(const T* src,
T* dst,
int num_in,
int channel_in,
int height_in,
int width_in);
template <typename T>
void reduce_sum_hw(const T* src,
T* dst,
int num_in,
int channel_in,
int height_in,
int width_in);
template <typename T>
void reduce_sum_all(const T* src, T* dst, int all_size);
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
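// A minimal dispatch sketch for the declarations above, assuming the common
// convention that an empty dim list means "reduce over every axis". The
// function name and the dim encoding are illustrative only; the real
// reduce_sum kernel that wires these helpers together is not shown here.
#include <vector>

template <typename T>
void reduce_sum_nchw_sketch(const T* src,
                            T* dst,
                            int n, int c, int h, int w,
                            const std::vector<int>& dim) {
  using namespace paddle::lite::arm::math;  // declarations above
  if (dim.empty()) {
    reduce_sum_all(src, dst, n * c * h * w);
  } else if (dim == std::vector<int>{0}) {
    reduce_sum_n(src, dst, n, c, h, w);
  } else if (dim == std::vector<int>{1}) {
    reduce_sum_c(src, dst, n, c, h, w);
  } else if (dim == std::vector<int>{2}) {
    reduce_sum_h(src, dst, n, c, h, w);
  } else if (dim == std::vector<int>{3}) {
    reduce_sum_w(src, dst, n, c, h, w);
  } else if (dim == std::vector<int>{0, 1}) {
    reduce_sum_nc(src, dst, n, c, h, w);
  } else if (dim == std::vector<int>{1, 2}) {
    reduce_sum_ch(src, dst, n, c, h, w);
  } else if (dim == std::vector<int>{2, 3}) {
    reduce_sum_hw(src, dst, n, c, h, w);
  }
}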
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/arm/math/scatter.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
template <>
void scatter<float>(const int64_t* indexs,
const float* src,
float* dst,
int index_size,
int num,
int size,
bool overwrite) {
for (int i = 0; i < num; i++) {
const float* din = src + indexs[i] * size;
memcpy(dst, din, sizeof(float) * size);
dst += size;
}
if (overwrite) {
for (int i = num; i < index_size; i++) {
const float* din = src + indexs[i] * size;
float* dout = dst + indexs[i] * size;
memcpy(dout, din, sizeof(float) * size);
}
} else {
int cnt = size >> 3;
int rem = size & 7;
for (int i = num; i < index_size; i++) {
const float* din = src + indexs[i] * size;
float* dout = dst + indexs[i] * size;
for (int j = 0; j < cnt; j++) {
float32x4_t va0 = vld1q_f32(din);
float32x4_t vb0 = vld1q_f32(dout);
float32x4_t va1 = vld1q_f32(din + 4);
float32x4_t vb1 = vld1q_f32(dout + 4);
vb0 = vaddq_f32(va0, vb0);
vb1 = vaddq_f32(va1, vb1);
din += 8;
vst1q_f32(dout, vb0);
vst1q_f32(dout + 4, vb1);
dout += 8;
}
for (int j = 0; j < rem; j++) {
dout[0] += *din++;
dout++;
}
}
}
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
......@@ -13,21 +13,22 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdio.h>
namespace paddle_mobile {
namespace zynqmp {
class DLEngine {
public:
static DLEngine& get_instance() {
static DLEngine s_instance;
return s_instance;
}
private:
DLEngine();
};
} // namespace zynqmp
} // namespace paddle_mobile
#include <stdint.h>
namespace paddle {
namespace lite {
namespace arm {
namespace math {
template <typename T>
void scatter(const int64_t* indexs,
const T* updates,
T* dst,
int index_size,
int num,
int size,
bool overwrite);
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
......@@ -23,7 +23,7 @@ int TargetWrapperBM::device_id_ = 0;
std::map<int, void*> TargetWrapperBM::bm_hds_;
size_t TargetWrapperBM::num_devices() {
int count = 0;
int count = 1;
bm_status_t ret = bm_dev_getcount(&count);
CHECK_EQ(ret, BM_SUCCESS) << "Failed with error code: "
<< static_cast<int>(ret);
......
......@@ -48,7 +48,7 @@ __kernel void depth_conv2d_3x3(
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
int2 in_pos_in_one_block =
ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
ouput_pos_in_one_block * stride_xy + (int2)(offset + dilation - 1, offset + dilation - 1);
#ifdef BIASE_CH
CL_DTYPE4 output =
......@@ -77,13 +77,13 @@ __kernel void depth_conv2d_3x3(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - dilation,
pos_in_input_block.y + in_pos_in_one_block.y - dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 ||
in_pos_in_one_block.y - 1 < 0 ||
in_pos_in_one_block.x - 1 >= input_width ||
in_pos_in_one_block.y - 1 >= input_height)
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
inputs[1] = select(
......@@ -91,45 +91,37 @@ __kernel void depth_conv2d_3x3(
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x,
pos_in_input_block.y + in_pos_in_one_block.y - 1)),
pos_in_input_block.y + in_pos_in_one_block.y - dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 ||
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y - 1 >= input_height)
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
inputs[2] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + dilation,
pos_in_input_block.y + in_pos_in_one_block.y - dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 ||
in_pos_in_one_block.y - 1 < 0 ||
in_pos_in_one_block.x + 1 >= input_width ||
in_pos_in_one_block.y - 1 >= input_height)
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
inputs[3] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - dilation,
pos_in_input_block.y + in_pos_in_one_block.y)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - 1 >= input_width ||
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
/*
if (output_pos.x == 112 && output_pos.y == 0) {
CL_DTYPE4 input1 = inputs[3];
float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
printf(" input4 3 - %v4hlf \n", in);
printf(" --- %d ---\n", in_pos_in_one_block.x - 1);
}
*/
inputs[4] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
......@@ -147,11 +139,11 @@ __kernel void depth_conv2d_3x3(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + dilation,
pos_in_input_block.y + in_pos_in_one_block.y)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + 1 >= input_width ||
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
......@@ -159,13 +151,13 @@ __kernel void depth_conv2d_3x3(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - dilation,
pos_in_input_block.y + in_pos_in_one_block.y + dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 ||
in_pos_in_one_block.y + 1 < 0 ||
in_pos_in_one_block.x - 1 >= input_width ||
in_pos_in_one_block.y + 1 >= input_height)
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
inputs[7] = select(
......@@ -173,24 +165,24 @@ __kernel void depth_conv2d_3x3(
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
pos_in_input_block.y + in_pos_in_one_block.y + dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 ||
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y + 1 >= input_height)
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
inputs[8] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + dilation,
pos_in_input_block.y + in_pos_in_one_block.y + dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 ||
in_pos_in_one_block.y + 1 < 0 ||
in_pos_in_one_block.x + 1 >= input_width ||
in_pos_in_one_block.y + 1 >= input_height)
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
CL_DTYPE4 filters[9];
......@@ -221,14 +213,18 @@ __kernel void depth_conv2d_3x3(
/*
if (output_pos.x == 112 && output_pos.y == 0) {
if (output_pos.x == 0 && output_pos.y == 0) {
for (int i = 0; i < 9; ++i) {
CL_DTYPE4 input1 = inputs[i];
float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
printf(" input4 %d - %v4hlf \n", i, in);
printf(" input4[%d]: %v4hlf \n", i, in);
}
for (int i = 0; i < 9; ++i) {
CL_DTYPE4 filters1 = filters[i];
float4 f = (float4)(filters1.x, filters1.y, filters1.z, filters1.w);
printf(" weights4[%d]: %v4hlf \n", i, f);
}
float4 out = (float4)(output.x, output.y, output.z, output.w);
printf(" depth wise output output4 = %v4hlf \n", out);
printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x);
......
......@@ -24,6 +24,7 @@
#include <sys/types.h>
#elif defined(_WIN32)
#define NOMINMAX // msvc max/min macro conflict with std::min/max
#define GLOG_NO_ABBREVIATED_SEVERITIES
#include <windows.h>
#else
#include <unistd.h>
......
......@@ -61,3 +61,5 @@ math_library(search_fc DEPS blas dynload_mklml)
# cc_test(beam_search_test SRCS beam_search_test.cc DEPS beam_search)
# cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
# cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
math_library(box_coder DEPS math_function)
math_library(prior_box DEPS math_function)
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/x86/math/box_coder.h"
#include <string>
namespace paddle {
namespace lite {
namespace x86 {
namespace math {
void encode_center_size(const int64_t row, // N
const int64_t col, // M
const int64_t len, // 4
const float* target_box_data,
const float* prior_box_data,
const float* prior_box_var_data,
const bool normalized,
const std::vector<float> variance,
float* output) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
size_t offset = i * col * len + j * len;
float prior_box_width = prior_box_data[j * len + 2] -
prior_box_data[j * len] + (normalized == false);
float prior_box_height = prior_box_data[j * len + 3] -
prior_box_data[j * len + 1] +
(normalized == false);
float prior_box_center_x = prior_box_data[j * len] + prior_box_width / 2;
float prior_box_center_y =
prior_box_data[j * len + 1] + prior_box_height / 2;
float target_box_center_x =
(target_box_data[i * len + 2] + target_box_data[i * len]) / 2;
float target_box_center_y =
(target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2;
float target_box_width = target_box_data[i * len + 2] -
target_box_data[i * len] + (normalized == false);
float target_box_height = target_box_data[i * len + 3] -
target_box_data[i * len + 1] +
(normalized == false);
output[offset] =
(target_box_center_x - prior_box_center_x) / prior_box_width;
output[offset + 1] =
(target_box_center_y - prior_box_center_y) / prior_box_height;
output[offset + 2] =
std::log(std::fabs(target_box_width / prior_box_width));
output[offset + 3] =
std::log(std::fabs(target_box_height / prior_box_height));
}
}
if (prior_box_var_data) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(3)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
for (int64_t k = 0; k < len; ++k) {
size_t offset = i * col * len + j * len;
int prior_var_offset = j * len;
output[offset + k] /= prior_box_var_data[prior_var_offset + k];
}
}
}
} else if (!(variance.empty())) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(3)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
for (int64_t k = 0; k < len; ++k) {
size_t offset = i * col * len + j * len;
output[offset + k] /= variance[k];
}
}
}
}
}
void decode_center_size(const int axis,
const int var_size,
const int64_t row,
const int64_t col,
const int64_t len,
const float* target_box_data,
const float* prior_box_data,
const float* prior_box_var_data,
const bool normalized,
const std::vector<float> variance,
float* output) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
float var_data[4] = {1., 1., 1., 1.};
float* var_ptr = var_data;
size_t offset = i * col * len + j * len;
int prior_box_offset = axis == 0 ? j * len : i * len;
float prior_box_width = prior_box_data[prior_box_offset + 2] -
prior_box_data[prior_box_offset] +
(normalized == false);
float prior_box_height = prior_box_data[prior_box_offset + 3] -
prior_box_data[prior_box_offset + 1] +
(normalized == false);
float prior_box_center_x =
prior_box_data[prior_box_offset] + prior_box_width / 2;
float prior_box_center_y =
prior_box_data[prior_box_offset + 1] + prior_box_height / 2;
float target_box_center_x = 0, target_box_center_y = 0;
float target_box_width = 0, target_box_height = 0;
int prior_var_offset = axis == 0 ? j * len : i * len;
if (var_size == 2) {
std::memcpy(
var_ptr, prior_box_var_data + prior_var_offset, 4 * sizeof(float));
} else if (var_size == 1) {
var_ptr = const_cast<float*>(variance.data());
}
float box_var_x = *var_ptr;
float box_var_y = *(var_ptr + 1);
float box_var_w = *(var_ptr + 2);
float box_var_h = *(var_ptr + 3);
target_box_center_x =
box_var_x * target_box_data[offset] * prior_box_width +
prior_box_center_x;
target_box_center_y =
box_var_y * target_box_data[offset + 1] * prior_box_height +
prior_box_center_y;
target_box_width =
std::exp(box_var_w * target_box_data[offset + 2]) * prior_box_width;
target_box_height =
std::exp(box_var_h * target_box_data[offset + 3]) * prior_box_height;
output[offset] = target_box_center_x - target_box_width / 2;
output[offset + 1] = target_box_center_y - target_box_height / 2;
output[offset + 2] =
target_box_center_x + target_box_width / 2 - (normalized == false);
output[offset + 3] =
target_box_center_y + target_box_height / 2 - (normalized == false);
}
}
}
} // namespace math
} // namespace x86
} // namespace lite
} // namespace paddle
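// For reference, the two routines above implement the standard center-size
// box coding. A sketch of the math (the variances v_* are 1 when neither a
// variance tensor nor a variance vector is supplied):
//   encode:  t_x = \frac{x_c^{t} - x_c^{p}}{w^{p} v_x},\quad
//            t_y = \frac{y_c^{t} - y_c^{p}}{h^{p} v_y},\quad
//            t_w = \frac{1}{v_w}\log\frac{w^{t}}{w^{p}},\quad
//            t_h = \frac{1}{v_h}\log\frac{h^{t}}{h^{p}}
//   decode:  x_c = v_x t_x w^{p} + x_c^{p},\;  w = w^{p} e^{v_w t_w}
//            y_c = v_y t_y h^{p} + y_c^{p},\;  h = h^{p} e^{v_h t_h}
// where superscript t denotes the target box, p the prior (anchor) box, and
// the decoded corners are (x_c - w/2, y_c - h/2, x_c + w/2, y_c + h/2).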
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "lite/backends/x86/math/math_function.h"
namespace paddle {
namespace lite {
namespace x86 {
namespace math {
void encode_center_size(const int64_t row,
const int64_t col,
const int64_t len,
const float* target_box_data,
const float* prior_box_data,
const float* prior_box_var_data,
const bool normalized,
const std::vector<float> variance,
float* output);
void decode_center_size(const int axis,
const int var_size,
const int64_t row,
const int64_t col,
const int64_t len,
const float* target_box_data,
const float* prior_box_data,
const float* prior_box_var_data,
const bool normalized,
const std::vector<float> variance,
float* output);
} // namespace math
} // namespace x86
} // namespace lite
} // namespace paddle
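// A minimal usage sketch for encode_center_size, with made-up box values and
// variances; only the argument layout is taken from the declaration above
// (row = number of target boxes, col = number of prior boxes, len = 4).
#include <cstdint>
#include <vector>
#include "lite/backends/x86/math/box_coder.h"

void encode_center_size_example() {
  const int64_t row = 2, col = 1, len = 4;
  // Boxes stored as (xmin, ymin, xmax, ymax), normalized to [0, 1].
  std::vector<float> target = {0.10f, 0.10f, 0.30f, 0.40f,
                               0.50f, 0.20f, 0.90f, 0.60f};
  std::vector<float> prior = {0.05f, 0.05f, 0.35f, 0.35f};
  std::vector<float> out(row * col * len);
  paddle::lite::x86::math::encode_center_size(
      row, col, len,
      target.data(), prior.data(),
      /*prior_box_var_data=*/nullptr,  // fall back to the variance vector
      /*normalized=*/true,
      std::vector<float>{0.1f, 0.1f, 0.2f, 0.2f},
      out.data());
  // out now holds one (t_x, t_y, t_w, t_h) tuple per (target, prior) pair.
}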
......@@ -161,7 +161,7 @@ class ContextProjectFunctor {
sequence_width});
if (up_pad > 0) { // add up pad
int padding_rows = std::min(
int padding_rows = (std::min)(
up_pad, static_cast<int>(lod_level_0[i + 1] - lod_level_0[i]));
for (int k = 0; k < padding_rows; ++k) {
......@@ -180,10 +180,10 @@ class ContextProjectFunctor {
}
if (down_pad > 0) { // add down pad
int down_pad_begin_row =
std::max(0,
(sequence_height - context_start - context_length) + 1) +
(std::max)(
0, (sequence_height - context_start - context_length) + 1) +
1;
int padding_begin = std::max(0, context_start - sequence_height);
int padding_begin = (std::max)(0, context_start - sequence_height);
int padding_size =
sequence_height - context_start >= context_length
? 1
......
......@@ -67,8 +67,8 @@ class Pool2dFunctor<lite::TargetType::kX86, PoolProcess, T> {
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
hend = (std::min)(hstart + ksize_height, input_height);
hstart = (std::max)(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
......@@ -76,8 +76,8 @@ class Pool2dFunctor<lite::TargetType::kX86, PoolProcess, T> {
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
wend = (std::min)(wstart + ksize_width, input_width);
wstart = (std::max)(wstart, 0);
}
T ele = pool_process.initial();
......@@ -150,8 +150,8 @@ class Pool2dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
hend = (std::min)(hstart + ksize_height, input_height);
hstart = (std::max)(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
......@@ -159,8 +159,8 @@ class Pool2dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
wend = (std::min)(wstart + ksize_width, input_width);
wstart = (std::max)(wstart, 0);
}
int pool_size = (exclusive || adaptive)
? (hend - hstart) * (wend - wstart)
......@@ -228,12 +228,12 @@ class MaxPool2dGradFunctor<lite::TargetType::kX86, T> {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < output_height; ++ph) {
int hstart = ph * stride_height - padding_height;
int hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
int hend = (std::min)(hstart + ksize_height, input_height);
hstart = (std::max)(hstart, 0);
for (int pw = 0; pw < output_width; ++pw) {
int wstart = pw * stride_width - padding_width;
int wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
int wend = (std::min)(wstart + ksize_width, input_width);
wstart = (std::max)(wstart, 0);
bool stop = false;
for (int h = hstart; h < hend && !stop; ++h) {
......@@ -337,8 +337,8 @@ class Pool3dFunctor<lite::TargetType::kX86, PoolProcess, T> {
dend = AdaptEndIndex(pd, input_depth, output_depth);
} else {
dstart = pd * stride_depth - padding_depth;
dend = std::min(dstart + ksize_depth, input_depth);
dstart = std::max(dstart, 0);
dend = (std::min)(dstart + ksize_depth, input_depth);
dstart = (std::max)(dstart, 0);
}
for (int ph = 0; ph < output_height; ++ph) {
if (adaptive) {
......@@ -346,8 +346,8 @@ class Pool3dFunctor<lite::TargetType::kX86, PoolProcess, T> {
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
hend = (std::min)(hstart + ksize_height, input_height);
hstart = (std::max)(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
......@@ -355,8 +355,8 @@ class Pool3dFunctor<lite::TargetType::kX86, PoolProcess, T> {
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
wend = (std::min)(wstart + ksize_width, input_width);
wstart = (std::max)(wstart, 0);
}
int output_idx = (pd * output_height + ph) * output_width + pw;
T ele = pool_process.initial();
......@@ -441,8 +441,8 @@ class Pool3dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
dend = AdaptEndIndex(pd, input_depth, output_depth);
} else {
dstart = pd * stride_depth - padding_depth;
dend = std::min(dstart + ksize_depth, input_depth);
dstart = std::max(dstart, 0);
dend = (std::min)(dstart + ksize_depth, input_depth);
dstart = (std::max)(dstart, 0);
}
for (int ph = 0; ph < output_height; ++ph) {
if (adaptive) {
......@@ -450,8 +450,8 @@ class Pool3dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
hend = (std::min)(hstart + ksize_height, input_height);
hstart = (std::max)(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
......@@ -459,8 +459,8 @@ class Pool3dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
wend = (std::min)(wstart + ksize_width, input_width);
wstart = (std::max)(wstart, 0);
}
int pool_size =
......@@ -540,16 +540,16 @@ class MaxPool3dGradFunctor<lite::TargetType::kX86, T> {
for (int c = 0; c < output_channels; ++c) {
for (int pd = 0; pd < output_depth; ++pd) {
int dstart = pd * stride_depth - padding_depth;
int dend = std::min(dstart + ksize_depth, input_depth);
dstart = std::max(dstart, 0);
int dend = (std::min)(dstart + ksize_depth, input_depth);
dstart = (std::max)(dstart, 0);
for (int ph = 0; ph < output_height; ++ph) {
int hstart = ph * stride_height - padding_height;
int hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
int hend = (std::min)(hstart + ksize_height, input_height);
hstart = (std::max)(hstart, 0);
for (int pw = 0; pw < output_width; ++pw) {
int wstart = pw * stride_width - padding_width;
int wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
int wend = (std::min)(wstart + ksize_width, input_width);
wstart = (std::max)(wstart, 0);
bool stop = false;
for (int d = dstart; d < dend && !stop; ++d) {
for (int h = hstart; h < hend && !stop; ++h) {
......@@ -651,8 +651,8 @@ class MaxPool2dWithIndexFunctor<lite::TargetType::kX86, T1, T2> {
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
hend = (std::min)(hstart + ksize_height, input_height);
hstart = (std::max)(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
......@@ -660,8 +660,8 @@ class MaxPool2dWithIndexFunctor<lite::TargetType::kX86, T1, T2> {
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
wend = (std::min)(wstart + ksize_width, input_width);
wstart = (std::max)(wstart, 0);
}
T1 ele = static_cast<T1>(-FLT_MAX);
......@@ -794,8 +794,8 @@ class MaxPool3dWithIndexFunctor<lite::TargetType::kX86, T1, T2> {
dend = AdaptEndIndex(pd, input_depth, output_depth);
} else {
dstart = pd * stride_depth - padding_depth;
dend = std::min(dstart + ksize_depth, input_depth);
dstart = std::max(dstart, 0);
dend = (std::min)(dstart + ksize_depth, input_depth);
dstart = (std::max)(dstart, 0);
}
for (int ph = 0; ph < output_height; ++ph) {
if (adaptive) {
......@@ -803,8 +803,8 @@ class MaxPool3dWithIndexFunctor<lite::TargetType::kX86, T1, T2> {
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
hend = std::min(hstart + ksize_height, input_height);
hstart = std::max(hstart, 0);
hend = (std::min)(hstart + ksize_height, input_height);
hstart = (std::max)(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
......@@ -812,8 +812,8 @@ class MaxPool3dWithIndexFunctor<lite::TargetType::kX86, T1, T2> {
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
wend = (std::min)(wstart + ksize_width, input_width);
wstart = (std::max)(wstart, 0);
}
int output_idx = (pd * output_height + ph) * output_width + pw;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -12,91 +12,51 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef DENSITY_PRIORBOX_OP
#pragma once
#include <operators/kernel/prior_box_kernel.h>
#include "lite/backends/x86/math/prior_box.h"
#include <algorithm>
#include <cmath>
#include <vector>
namespace paddle_mobile {
namespace operators {
template <typename T>
struct ClipFunctor {
inline T operator()(T in) const {
return std::min<T>(std::max<T>(in, 0.), 1.);
}
};
template <typename P>
void DensityPriorBoxCompute(const DensityPriorBoxParam<CPU> &param) {
const auto *input_ = param.Input();
const auto &input_dims = input_->dims();
const auto *input_image = param.InputImage();
const auto &input_image_dims = input_image->dims();
auto densities = param.Densities();
auto fixed_ratios = param.FixedRatios();
auto fixed_sizes = param.FixedSizes();
const auto &variances = param.Variances();
const bool &clip = param.Clip();
const float &step_w = param.StepW();
const float &step_h = param.StepH();
const float &offset = param.Offset();
Tensor *output_boxes = param.OutputBoxes();
auto output_boxes_dataptr = output_boxes->mutable_data<float>();
Tensor *output_variances = param.OutputVariances();
auto output_variances_dataptr = output_variances->mutable_data<float>();
auto img_width = input_image_dims[3];
auto img_height = input_image_dims[2];
auto feature_width = input_dims[3];
auto feature_height = input_dims[2];
auto stride0 = output_boxes->dims()[1] * output_boxes->dims()[2] *
output_boxes->dims()[3];
auto stride1 = output_boxes->dims()[2] * output_boxes->dims()[3];
auto stride2 = output_boxes->dims()[3];
float step_width, step_height;
/// 300 / 19
if (step_w == 0 || step_h == 0) {
step_width = static_cast<float>(img_width) / feature_width;
step_height = static_cast<float>(img_height) / feature_height;
} else {
step_width = step_w;
step_height = step_h;
}
int num_priors = 0;
for (size_t i = 0; i < densities.size(); ++i) {
num_priors += (fixed_ratios.size()) * (pow(densities[i], 2));
}
auto box_dim = output_variances->dims();
output_boxes->Resize({feature_height, feature_width, num_priors, 4});
#include <string>
namespace paddle {
namespace lite {
namespace x86 {
namespace math {
void density_prior_box(const int64_t img_width,
const int64_t img_height,
const int64_t feature_width,
const int64_t feature_height,
const float* input_data,
const float* image_data,
const bool clip,
const std::vector<float> variances,
const std::vector<float> fixed_sizes,
const std::vector<float> fixed_ratios,
const std::vector<int> densities,
const float step_width,
const float step_height,
const float offset,
const int num_priors,
float* boxes_data,
float* vars_data) {
int step_average = static_cast<int>((step_width + step_height) * 0.5);
std::vector<float> sqrt_fixed_ratios;
for (size_t i = 0; i < fixed_ratios.size(); i++) {
sqrt_fixed_ratios.push_back(sqrt(fixed_ratios[i]));
}
for (int h = 0; h < feature_height; ++h) {
for (int w = 0; w < feature_width; ++w) {
/// map origin image
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for (int64_t h = 0; h < feature_height; ++h) {
for (int64_t w = 0; w < feature_width; ++w) {
float center_x = (w + offset) * step_width;
float center_y = (h + offset) * step_height;
int idx = 0;
int64_t offset = (h * feature_width + w) * num_priors * 4;
// Generate density prior boxes with fixed sizes.
for (size_t s = 0; s < fixed_sizes.size(); ++s) {
auto fixed_size = fixed_sizes[s];
int density = densities[s];
......@@ -111,51 +71,48 @@ void DensityPriorBoxCompute(const DensityPriorBoxParam<CPU> &param) {
for (int dj = 0; dj < density; ++dj) {
float center_x_temp = density_center_x + dj * shift;
float center_y_temp = density_center_y + di * shift;
output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
0] =
std::max((center_x_temp - box_width_ratio / 2.) / img_width,
0.);
output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
1] =
std::max((center_y_temp - box_height_ratio / 2.) / img_height,
0.);
output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
2] =
std::min((center_x_temp + box_width_ratio / 2.) / img_width,
1.);
output_boxes_dataptr[h * stride0 + w * stride1 + idx * stride2 +
3] =
std::min((center_y_temp + box_height_ratio / 2.) / img_height,
1.);
idx++;
boxes_data[offset++] = std::max(
(center_x_temp - box_width_ratio / 2.) / img_width, 0.);
boxes_data[offset++] = std::max(
(center_y_temp - box_height_ratio / 2.) / img_height, 0.);
boxes_data[offset++] = std::min(
(center_x_temp + box_width_ratio / 2.) / img_width, 1.);
boxes_data[offset++] = std::min(
(center_y_temp + box_height_ratio / 2.) / img_height, 1.);
}
}
}
}
}
}
//! clip the prior's coordinate such that it is within [0, 1]
if (clip) {
math::Transform trans;
ClipFunctor<float> clip_func;
trans(output_boxes_dataptr, output_boxes_dataptr + output_boxes->numel(),
output_boxes_dataptr, clip_func);
int channel_size = feature_height * feature_width * num_priors * 4;
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (int d = 0; d < channel_size; ++d) {
boxes_data[d] = std::min(std::max(boxes_data[d], 0.f), 1.f);
}
}
//! set the variance.
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(3)
#endif
for (int h = 0; h < feature_height; ++h) {
for (int w = 0; w < feature_width; ++w) {
for (int i = 0; i < num_priors; ++i) {
int idx = ((h * feature_width + w) * num_priors + i) * 4;
vars_data[idx++] = variances[0];
vars_data[idx++] = variances[1];
vars_data[idx++] = variances[2];
vars_data[idx++] = variances[3];
}
if ((variances.size() != 4)) {
LOG(kLOG_ERROR) << " variances.size() must be 4.";
}
int64_t box_num = feature_height * feature_width * num_priors;
for (int i = 0; i < box_num; i++) {
output_variances_dataptr[4 * i] = variances[0];
output_variances_dataptr[4 * i + 1] = variances[1];
output_variances_dataptr[4 * i + 2] = variances[2];
output_variances_dataptr[4 * i + 3] = variances[3];
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
} // namespace math
} // namespace x86
} // namespace lite
} // namespace paddle
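// A sketch of the quantities involved above. The number of priors generated
// per feature-map cell is
//   num_priors = \sum_{s} |fixed\_ratios| \cdot densities[s]^2
// and, under the usual density-prior-box definition (an assumption here,
// since the box_width_ratio / box_height_ratio computation is elided from
// this hunk),
//   box\_width\_ratio_{s,r}  = fixed\_sizes[s] \cdot \sqrt{fixed\_ratios[r]}
//   box\_height\_ratio_{s,r} = fixed\_sizes[s] / \sqrt{fixed\_ratios[r]}
// with each density x density sub-grid spaced by step_average / density
// inside a cell of size (step_width, step_height).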
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "lite/backends/x86/math/math_function.h"
namespace paddle {
namespace lite {
namespace x86 {
namespace math {
void density_prior_box(const int64_t img_width,
const int64_t img_height,
const int64_t feature_width,
const int64_t feature_height,
const float* input_data,
const float* image_data,
const bool clip,
const std::vector<float> variances,
const std::vector<float> fixed_sizes,
const std::vector<float> fixed_ratios,
const std::vector<int> densities,
const float step_width,
const float step_height,
const float offset,
const int num_priors,
float* boxes_data,
float* vars_data);
} // namespace math
} // namespace x86
} // namespace lite
} // namespace paddle
......@@ -35,7 +35,7 @@ inline static uint64_t MaximumSequenceLength(
uint64_t seq_num = seq_offset.size() - 1;
uint64_t max_seq_len = 0;
for (size_t i = 0; i < seq_num; ++i) {
max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]);
max_seq_len = (std::max)(max_seq_len, seq_offset[i + 1] - seq_offset[i]);
}
return max_seq_len;
}
......
......@@ -26,7 +26,7 @@ namespace x86 {
static void SetNumThreads(int num_threads) {
#ifdef PADDLE_WITH_MKLML
int real_num_threads = std::max(num_threads, 1);
int real_num_threads = (std::max)(num_threads, 1);
x86::MKL_Set_Num_Threads(real_num_threads);
omp_set_num_threads(real_num_threads);
#endif
......@@ -52,14 +52,14 @@ static inline void RunParallelFor(const int64_t begin,
}
#ifdef PADDLE_WITH_MKLML
int64_t num_threads = std::min(GetMaxThreads(), end - begin);
int64_t num_threads = (std::min)(GetMaxThreads(), end - begin);
if (num_threads > 1) {
#pragma omp parallel num_threads(num_threads)
{
int64_t tid = omp_get_thread_num();
int64_t chunk_size = (end - begin + num_threads - 1) / num_threads;
int64_t begin_tid = begin + tid * chunk_size;
f(begin_tid, std::min(end, chunk_size + begin_tid));
f(begin_tid, (std::min)(end, chunk_size + begin_tid));
}
return;
}
......
......@@ -18,6 +18,27 @@
namespace paddle {
namespace lite {
void XPUScratchPad::Reserve(size_t new_size) {
if (new_size <= size_) {
return;
}
if (!is_l3_) {
TargetWrapperXPU::Free(addr_);
addr_ = TargetWrapperXPU::Malloc(new_size);
size_ = new_size;
} else {
CHECK(false) << "Not supported if is_l3_ == true";
}
}
void XPUScratchPadDeleter::operator()(XPUScratchPad* sp) const {
if (!sp->is_l3_) {
TargetWrapperXPU::Free(sp->addr_);
}
delete sp;
}
void* TargetWrapperXPU::Malloc(size_t size) {
void* ptr{nullptr};
XPU_CALL(xpu_malloc(&ptr, size));
......@@ -51,7 +72,7 @@ XPUScratchPadGuard TargetWrapperXPU::MallocScratchPad(size_t size,
ptr = TargetWrapperXPU::Malloc(size);
}
CHECK(ptr != nullptr) << "size = " << size << ", use_l3 = " << use_l3;
return XPUScratchPadGuard(new XPUScratchPad(ptr, use_l3));
return XPUScratchPadGuard(new XPUScratchPad(ptr, size, use_l3));
}
std::string TargetWrapperXPU::multi_encoder_precision; // NOLINT
......
......@@ -37,19 +37,19 @@ const int XPU_MAX_LOD_SEQ_LEN = 512;
using TargetWrapperXPU = TargetWrapper<TARGET(kXPU)>;
struct XPUScratchPad {
XPUScratchPad(void* addr, bool is_l3) : addr_(addr), is_l3_(is_l3) {}
XPUScratchPad(void* addr, size_t size, bool is_l3)
: addr_(addr), size_(size), is_l3_(is_l3) {}
// XXX(miaotianxiang): |size_| increases monotonically
void Reserve(size_t new_size);
void* addr_{nullptr};
size_t size_{0};
bool is_l3_{false};
};
struct XPUScratchPadDeleter {
void operator()(XPUScratchPad* sp) const {
if (!sp->is_l3_) {
XPU_CALL(xpu_free(sp->addr_));
}
delete sp;
}
void operator()(XPUScratchPad* sp) const;
};
using XPUScratchPadGuard = std::unique_ptr<XPUScratchPad, XPUScratchPadDeleter>;
......
......@@ -2,7 +2,7 @@ if (WITH_TESTING)
lite_cc_library(lite_gtest_main SRCS lite_gtest_main.cc DEPS gtest gflags)
endif()
lite_cc_library(target_wrapper SRCS target_wrapper.cc
DEPS target_wrapper_host place
DEPS target_wrapper_host place fbs_headers
X86_DEPS target_wrapper_x86
CUDA_DEPS target_wrapper_cuda
XPU_DEPS target_wrapper_xpu
......
......@@ -176,6 +176,9 @@ void get_cpu_arch(std::vector<ARMArch>* archs, const int cpu_num) {
case 0xd0a:
arch_type = kA75;
break;
case 0xd0d:
arch_type = kA77;
break;
case 0xd40:
arch_type = kA76;
break;
......@@ -637,6 +640,20 @@ void DeviceInfo::SetArchInfo(int argc, ...) {
bool DeviceInfo::SetCPUInfoByName() {
/* Snapdragon */
if (dev_name_.find("KONA") != std::string::npos) { // 865
core_num_ = 8;
core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
big_core_ids_ = {4, 5, 6, 7};
little_core_ids_ = {0, 1, 2, 3};
cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
SetArchInfo(2, kA77, kA55);
SetCacheInfo(0, 2, 192 * 1024, 256 * 1024);
SetCacheInfo(1, 2, 768 * 1024, 512 * 1024);
SetCacheInfo(2, 1, 4 * 1024 * 1024);
SetFP16Info(1, 1);
SetDotInfo(2, 1, 1);
return true;
}
if (dev_name_.find("SM8150") != std::string::npos) { // 855
core_num_ = 8;
core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
......
......@@ -17,6 +17,7 @@
#include <cstdarg>
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/core/tensor.h"
#include "lite/utils/cp_logging.h"
#ifdef LITE_WITH_MLU
......@@ -27,6 +28,7 @@
namespace paddle {
namespace lite {
using L3CacheSetMethod = lite_api::L3CacheSetMethod;
#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))
typedef enum {
......@@ -38,6 +40,8 @@ typedef enum {
kA73 = 73,
kA75 = 75,
kA76 = 76,
kA77 = 77,
kA78 = 78,
kARMArch_UNKOWN = -1
} ARMArch;
......@@ -65,11 +69,41 @@ class DeviceInfo {
int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }
// Methods for configuring the L3 cache workspace on the Arm platform.
// The enum class L3CacheSetMethod is declared in `lite/api/paddle_api.h`.
void SetArmL3CacheSize(
L3CacheSetMethod method = L3CacheSetMethod::kDeviceL3Cache,
int absolute_val = -1) {
l3_cache_method_ = method;
absolute_l3cache_size_ = absolute_val;
// Realloc memory for sgemm in this context.
workspace_.clear();
workspace_.Resize({llc_size()});
workspace_.mutable_data<int8_t>();
}
int llc_size() const {
auto size = L3_cache_[active_ids_[0]] > 0 ? L3_cache_[active_ids_[0]]
auto size = absolute_l3cache_size_;
switch (l3_cache_method_) {
// kDeviceL3Cache = 0, use the system L3 Cache size, best performance.
case L3CacheSetMethod::kDeviceL3Cache:
size = L3_cache_[active_ids_[0]] > 0 ? L3_cache_[active_ids_[0]]
: L2_cache_[active_ids_[0]];
break;
// kDeviceL2Cache = 1, use the system L2 cache size; trades some
// performance for lower memory consumption.
case L3CacheSetMethod::kDeviceL2Cache:
size = L2_cache_[active_ids_[0]];
break;
// kAbsolute = 2, use the external setting.
case L3CacheSetMethod::kAbsolute:
break;
default:
LOG(FATAL) << "Error: unknown l3_cache_method_ !";
}
return size > 0 ? size : 512 * 1024;
}
bool has_dot() const { return dot_[active_ids_[0]]; }
bool has_fp16() const { return fp16_[active_ids_[0]]; }
......@@ -121,6 +155,10 @@ class DeviceInfo {
void RequestPowerRandHighMode(int shift_num, int thread_num);
void RequestPowerRandLowMode(int shift_num, int thread_num);
// Settings for the L3 cache workspace on the Arm platform.
// The enum class L3CacheSetMethod is declared in `lite/api/paddle_api.h`.
L3CacheSetMethod l3_cache_method_{L3CacheSetMethod::kDeviceL3Cache};
int absolute_l3cache_size_{-1};
DeviceInfo() = default;
};
#endif // LITE_WITH_ARM
......
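// A minimal sketch of how SetArmL3CacheSize (shown in the hunk above) might
// be called. DeviceInfo::Global() and the include path are assumptions based
// on the surrounding code, and the absolute byte size is illustrative.
#include "lite/core/device_info.h"  // assumed header path

void l3_cache_config_sketch() {
  using paddle::lite::DeviceInfo;
  using paddle::lite::L3CacheSetMethod;
  // Default behaviour: size the workspace from the device L3 cache.
  DeviceInfo::Global().SetArmL3CacheSize(L3CacheSetMethod::kDeviceL3Cache);
  // Lower memory footprint: size the workspace from the L2 cache instead.
  DeviceInfo::Global().SetArmL3CacheSize(L3CacheSetMethod::kDeviceL2Cache);
  // Fully manual: pin the workspace to an absolute byte count.
  DeviceInfo::Global().SetArmL3CacheSize(L3CacheSetMethod::kAbsolute,
                                         512 * 1024);
}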
......@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include <algorithm>
#include <string>
#include "lite/api/paddle_place.h"
#include "lite/core/target_wrapper.h"
......@@ -140,20 +141,21 @@ class Buffer {
#ifdef LITE_WITH_OPENCL
template <typename T>
void ResetLazyImage2D(TargetType target,
const size_t img_w,
const size_t img_h,
const size_t img_w_req,
const size_t img_h_req,
void* host_ptr = nullptr) {
if (target != target_ || cl_image2d_width_ < img_w ||
cl_image2d_height_ < img_h || host_ptr != nullptr) {
if (target != target_ || cl_image2d_width_ < img_w_req ||
cl_image2d_height_ < img_h_req || host_ptr != nullptr) {
CHECK_EQ(own_data_, true) << "Can not reset unowned buffer.";
cl_image2d_width_ = std::max(cl_image2d_width_, img_w_req);
cl_image2d_height_ = std::max(cl_image2d_height_, img_h_req);
Free();
data_ = TargetWrapperCL::MallocImage<T>(img_w, img_h, host_ptr);
data_ = TargetWrapperCL::MallocImage<T>(
cl_image2d_width_, cl_image2d_height_, host_ptr);
target_ = target;
space_ = sizeof(T) * img_w * img_h *
space_ = sizeof(T) * cl_image2d_width_ * cl_image2d_height_ *
4; // un-used for opencl Image2D, 4 for RGBA,
cl_use_image2d_ = true;
cl_image2d_width_ = img_w;
cl_image2d_height_ = img_h;
}
}
#endif
......
......@@ -28,6 +28,12 @@ TEST(memory, test) {
ASSERT_TRUE(buf_cuda);
TargetFree(TARGET(kCUDA), buf_cuda);
#endif
#ifdef LITE_WITH_OPENCL
auto* buf_cl = TargetMalloc(TARGET(kOpenCL), 10);
ASSERT_TRUE(buf_cl);
TargetFree(TARGET(kOpenCL), buf_cl);
#endif
}
} // namespace lite
......
......@@ -244,6 +244,7 @@ class XPUConv2dBlock0Fuser : public FuseBase {
std::string output_name = "";
if (_with_relu) {
op_desc.SetAttr("act_type", std::string{"relu"});
output_name = matched.at("relu_out")->arg()->name;
} else {
output_name = matched.at("bn_out")->arg()->name;
......@@ -433,6 +434,7 @@ class XPUConv2dBlock1Fuser : public FuseBase {
TARGET(kXPU), PRECISION(kFloat), DATALAYOUT(kNCHW));
scope->NewTensor(max_output_name);
op_desc.SetOutput("OutputMax", {max_output_name});
op_desc.SetAttr("act_type", std::string{"relu"});
auto conv_op = LiteOpRegistry::Global().Create("__xpu__conv2d");
auto& valid_places = conv_old->valid_places();
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "lite/core/mir/fusion/conv_conv_fuse_pass.h"
#include <list>
#include <memory>
#include <vector>
#include "lite/core/mir/fusion/conv_conv_fuser.h"
......@@ -27,13 +28,10 @@ void ConvConvFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// initialize fuser params
std::vector<bool> conv_has_bias_cases{true, false};
std::vector<std::string> conv_type_cases{"conv2d", "depthwise_conv2d"};
bool has_fp32 = false;
bool has_int8 = false;
bool has_weight_quant = false;
for (auto& place : graph->valid_places()) {
if (place.target == TARGET(kARM) || place.target == TARGET(kHost)) {
if (place.precision == PRECISION(kFloat)) {
has_fp32 = true;
}
if (place.precision == PRECISION(kInt8)) {
has_int8 = true;
}
......@@ -42,8 +40,18 @@ void ConvConvFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
return;
}
}
const std::list<mir::Node>& nodes = graph->nodes();
for (auto& node : nodes) {
if (node.IsStmt()) {
auto* op_info = (node.stmt())->op_info();
if (op_info->HasAttr("quantization_type")) {
has_weight_quant = true;
break;
}
}
}
// only support arm-fp32
if (has_int8 || (has_fp32 && has_int8)) {
if (has_int8 || has_weight_quant) {
return;
}
// only support fp32 fusion
......
......@@ -61,5 +61,4 @@ void QuantDequantFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(lite_quant_dequant_fuse_pass,
paddle::lite::mir::QuantDequantFusePass)
.BindTargets({TARGET(kAny)})
.BindKernel("calib");
.BindTargets({TARGET(kAny)});
......@@ -148,7 +148,7 @@ void MemoryOptimizePass::CollectLifeCycleByDevice(
int cur_life =
(*lifecycles)[TargetToStr(target_type)][var_name].second;
(*lifecycles)[TargetToStr(target_type)][var_name].second =
std::max(max_lifecycle_, cur_life);
(std::max)(max_lifecycle_, cur_life);
}
}
++max_lifecycle_;
......
......@@ -61,7 +61,7 @@ class StaticKernelPickPass : public mir::StmtPass {
float final_score{-1.};
Place winner_place{places[0]};
const int kMax =
std::numeric_limits<core::KernelPickFactor::value_type>::max();
(std::numeric_limits<core::KernelPickFactor::value_type>::max)();
size_t place_size = places.size();
// NOTE: We compare kernel's place with place in valid_places to select the
......
......@@ -17,8 +17,6 @@
#include <cmath>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/test_helper.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
......
......@@ -82,8 +82,11 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph,
// This is not a robust check, but the source of this issue has not been
// traced from static_pick_kernel_pass to this pass.
auto is_host = [](TargetType x) -> bool {
return x == TARGET(kHost) || x == TARGET(kX86) || x == TARGET(kARM);
};
auto* in_arg_type = const_cast<Type*>(in->AsArg().type);
if (in_arg_type->target() == TARGET(kARM) &&
if (is_host(in_arg_type->target()) &&
in_arg_type->layout() == DATALAYOUT(kImageDefault)) {
return;
}
......
......@@ -233,67 +233,98 @@ bool OpInfo::GetOutputIndex(const std::string &output_name, int *out) const {
return false;
}
bool OpInfo::HasInputScale(const std::string &input_name) const {
bool OpInfo::HasInputScale(const std::string &name, bool is_scale_name) const {
bool res = false;
if (is_scale_name) {
res = HasAttr(name);
} else {
std::string argname;
int index;
if (GetInputArgname(input_name, &argname) &&
GetInputIndex(input_name, &index)) {
return HasAttr(argname + to_string(index) + "_scale");
} else {
return false;
if (GetInputArgname(name, &argname) && GetInputIndex(name, &index)) {
res = HasAttr(argname + to_string(index) + "_scale");
}
}
return res;
}
bool OpInfo::HasOutputScale(const std::string &output_name) const {
bool OpInfo::HasOutputScale(const std::string &name, bool is_scale_name) const {
bool res = false;
if (is_scale_name) {
res = HasAttr(name);
} else {
std::string argname;
int index;
if (GetOutputArgname(output_name, &argname) &&
GetOutputIndex(output_name, &index)) {
return HasAttr(argname + to_string(index) + "_scale");
} else {
return false;
if (GetOutputArgname(name, &argname) && GetOutputIndex(name, &index)) {
res = HasAttr(argname + to_string(index) + "_scale");
}
}
return res;
}
void OpInfo::SetInputScale(const std::string &input_name,
const std::vector<float> &scale_value) {
void OpInfo::SetInputScale(const std::string &name,
const std::vector<float> &scale_value,
bool is_scale_name) {
std::string scale_name;
if (is_scale_name) {
scale_name = name;
} else {
std::string argname;
int index;
CHECK(GetInputArgname(input_name, &argname));
CHECK(GetInputIndex(input_name, &index));
CHECK(GetInputArgname(name, &argname));
CHECK(GetInputIndex(name, &index));
CHECK(scale_value.size() > 0)
<< "Error in SetInputScale: the scales should not be empty";
SetAttr<std::vector<float>>(argname + to_string(index) + "_scale",
scale_value);
scale_name = argname + to_string(index) + "_scale";
}
SetAttr<std::vector<float>>(scale_name, scale_value);
}
void OpInfo::SetOutputScale(const std::string &output_name,
const std::vector<float> &scale_value) {
void OpInfo::SetOutputScale(const std::string &name,
const std::vector<float> &scale_value,
bool is_scale_name) {
std::string scale_name;
if (is_scale_name) {
scale_name = name;
} else {
std::string argname;
int index;
CHECK(GetOutputArgname(output_name, &argname));
CHECK(GetOutputIndex(output_name, &index));
CHECK(GetOutputArgname(name, &argname));
CHECK(GetOutputIndex(name, &index));
CHECK(scale_value.size() > 0)
<< "Error in SetOutputScale: the scales should not be empty";
SetAttr<std::vector<float>>(argname + to_string(index) + "_scale",
scale_value);
scale_name = argname + to_string(index) + "_scale";
}
SetAttr<std::vector<float>>(scale_name, scale_value);
}
std::vector<float> OpInfo::GetInputScale(const std::string &input_name) const {
std::vector<float> OpInfo::GetInputScale(const std::string &name,
bool is_scale_name) const {
std::string scale_name;
if (is_scale_name) {
scale_name = name;
} else {
std::string argname;
int index;
CHECK(GetInputArgname(input_name, &argname));
CHECK(GetInputIndex(input_name, &index));
return GetAttr<std::vector<float>>(argname + to_string(index) + "_scale");
CHECK(GetInputArgname(name, &argname));
CHECK(GetInputIndex(name, &index));
scale_name = argname + to_string(index) + "_scale";
}
return GetAttr<std::vector<float>>(scale_name);
}
std::vector<float> OpInfo::GetOutputScale(
const std::string &output_name) const {
std::vector<float> OpInfo::GetOutputScale(const std::string &name,
bool is_scale_name) const {
std::string scale_name;
if (is_scale_name) {
scale_name = name;
} else {
std::string argname;
int index;
CHECK(GetOutputArgname(output_name, &argname));
CHECK(GetOutputIndex(output_name, &index));
return GetAttr<std::vector<float>>(argname + to_string(index) + "_scale");
CHECK(GetOutputArgname(name, &argname));
CHECK(GetOutputIndex(name, &index));
scale_name = argname + to_string(index) + "_scale";
}
return GetAttr<std::vector<float>>(scale_name);
}
} // namespace lite
......
......@@ -251,19 +251,31 @@ class OpInfo : public cpp::OpDesc {
bool GetInputIndex(const std::string &input_name, int *out) const;
bool GetOutputIndex(const std::string &output_name, int *out) const;
bool HasInputScale(const std::string &input_name) const;
bool HasOutputScale(const std::string &output_name) const;
// Suppose a quantized op has two input argnames (X, Y) and one output
// argname (Out). The scales of input argname X are saved in the op desc as
// (X0_scale, scale_value_0), (X1_scale, scale_value_1), ...
// The following APIs get or set the quantized scales in the op desc.
// When passing an input or output tensor name, is_scale_name should be
// false; when passing a scale name such as "X0_scale", it should be true.
bool HasInputScale(const std::string &name, bool is_scale_name = false) const;
bool HasOutputScale(const std::string &name,
bool is_scale_name = false) const;
void SetInputScale(const std::string &input_name,
const std::vector<float> &scale_value);
const std::vector<float> &scale_value,
bool is_scale_name = false);
void SetOutputScale(const std::string &output_name,
const std::vector<float> &scale_value);
const std::vector<float> &scale_value,
bool is_scale_name = false);
// For conv2d, depthwise_conv2d and mul, the weight scales form a vector.
// Otherwise, all input and output scales are scalars, but they are still
// stored as vectors.
std::vector<float> GetInputScale(const std::string &input_name) const;
std::vector<float> GetOutputScale(const std::string &output_name) const;
std::vector<float> GetInputScale(const std::string &name,
bool is_scale_name = false) const;
std::vector<float> GetOutputScale(const std::string &name,
bool is_scale_name = false) const;
};
} // namespace lite
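For orientation, here is a minimal sketch (not part of this patch) of how a pass could use the two addressing modes introduced above; the helper name, the tensor names and the include path are assumptions made purely for illustration:

#include <string>
#include <vector>
#include "lite/core/op_lite.h"  // assumed location of OpInfo

// Hypothetical helper: propagate an input scale to an output.
void PropagateScale(paddle::lite::OpInfo* op_info,
                    const std::string& in_name,     // e.g. "conv2d_0.tmp_0"
                    const std::string& out_name) {  // e.g. "conv2d_1.tmp_0"
  // Address by input/output name: is_scale_name defaults to false, and the
  // attribute key "<argname><index>_scale" is derived internally.
  if (op_info->HasInputScale(in_name)) {
    op_info->SetOutputScale(out_name, op_info->GetInputScale(in_name));
  }
  // Address by scale name directly, e.g. when iterating over raw attributes.
  if (op_info->HasInputScale("X0_scale", /*is_scale_name=*/true)) {
    auto scales = op_info->GetInputScale("X0_scale", /*is_scale_name=*/true);
    (void)scales;
  }
}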
......
......@@ -80,8 +80,8 @@ class Optimizer {
InitControlFlowOpUnusedInputsAndOutputsEliminatePass();
if (passes.empty() || passes.size() == 1) {
std::vector<std::string> passes_local{
{"lite_quant_dequant_fuse_pass", //
std::vector<std::string> passes_local{{
"lite_quant_dequant_fuse_pass", //
"weight_quantization_preprocess_pass", //
"lite_conv_elementwise_fuse_pass", // conv-elemwise-bn
"lite_conv_bn_fuse_pass", //
......@@ -108,6 +108,7 @@ class Optimizer {
#endif
"identity_dropout_eliminate_pass",
"__xpu__resnet_fuse_pass",
"__xpu__resnet_d_fuse_pass",
"__xpu__resnet_cbam_fuse_pass",
"__xpu__conv2d_fuse_pass",
"__xpu__conv2d_link_previous_out_max_pass",
......@@ -169,8 +170,9 @@ class Optimizer {
"runtime_context_assign_pass",
"argument_type_display_pass",
"lite_reshape_fuse_pass",
"memory_optimize_pass"}};
"memory_optimize_pass" // you can comment this line when enable
// PRECISION_PROFILE
}};
if (passes.size() == 1) {
// multi_stream_analysis_pass must be in the front of
......
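The inline note on memory_optimize_pass above exists presumably because that pass lets kernels share output buffers, so intermediate results may be overwritten before the precision profiler can inspect them; commenting the pass out keeps each output intact when PRECISION_PROFILE is enabled.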
......@@ -18,10 +18,18 @@
* of each kernel.
*/
#pragma once
#include <sys/time.h>
#include <time.h>
#include <cmath>
#include <cstdlib>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "lite/core/program.h"
#include "lite/utils/io.h"
#ifdef LITE_WITH_X86
#include "lite/fluid/float16.h"
#endif
......@@ -40,14 +48,50 @@ namespace paddle {
namespace lite {
namespace profile {
static const std::string get_date_str() {
struct tm tm_time;
time_t timestamp = time(NULL);
localtime_r(&timestamp, &tm_time);
struct timeval tv;
gettimeofday(&tv, NULL);
// print date / time
std::string date_str =
std::to_string(1900 + tm_time.tm_year) +
std::to_string(1 + tm_time.tm_mon) + std::to_string(tm_time.tm_mday) +
'_' + std::to_string(tm_time.tm_hour) + std::to_string(tm_time.tm_min) +
std::to_string(tm_time.tm_sec) + '_' + std::to_string(tv.tv_usec / 1000);
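  // e.g. 2020-08-15 09:30:12.005 -> "2020815_93012_5" (fields are not zero-padded)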
return date_str;
}
inline std::string generate_valid_tensor_name(const std::string& name) {
std::string new_name("");
for (size_t i = 0; i < name.length(); ++i) {
if (name[i] != '/') {
new_name += name[i];
} else {
new_name += "_";
}
}
return new_name;
}
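// For example, generate_valid_tensor_name("fc_0.tmp_1/out") returns
// "fc_0.tmp_1_out"; '/' is the only character that gets replaced.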
template <typename dtype>
static bool write_tensorfile(const Tensor* tensor, const std::string& locate) {
if (locate.find('/') != std::string::npos) {
return false;
static bool write_tensorfile(
const Tensor* tensor,
const std::string& tensor_name,
const std::string prefix_path = "/storage/emulated/0/") {
std::string new_tensor_name = generate_valid_tensor_name(tensor_name);
if (tensor_name.find('/') != std::string::npos) {
LOG(ERROR) << "--> tensor name is abnormal with '\\':" << tensor_name
<< " !!!, replace with '_'," << new_tensor_name
<< new_tensor_name;
}
FILE* fp = fopen(locate.c_str(), "w");
std::string tensor_save_path = prefix_path + new_tensor_name + ".txt";
FILE* fp = fopen(tensor_save_path.c_str(), "w");
if (fp == nullptr) {
LOG(ERROR) << "file open field " << locate;
LOG(ERROR) << "failed open file " << tensor_save_path;
return false;
} else {
const dtype* data = tensor->data<dtype>();
......@@ -56,19 +100,23 @@ static bool write_tensorfile(const Tensor* tensor, const std::string& locate) {
}
}
fclose(fp);
LOG(INFO) << "write tensor " << tensor_name
<< " to file:" << tensor_save_path;
return true;
}
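// With the defaults above, each dumped tensor lands at
// <prefix_path><sanitized_tensor_name>.txt; the callers in this patch pass
// log_dir_, i.e. /storage/emulated/0/PaddleLite_<date>/<tensor_name>.txt.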
static bool write_precision_summary_tofile(const std::string& string,
const std::string& log_dir = "") {
if (log_dir == "") {
LOG(INFO) << "The `log_dir` of precision summary file is not set. log_dir:"
<< log_dir;
static bool write_precision_summary_tofile(
const std::string& string, const std::string& summary_log_dir = "") {
if (summary_log_dir == "") {
LOG(INFO) << "The `summary_log_dir` of precision summary file is not set. "
"summary_log_dir:"
<< summary_log_dir;
return false;
}
FILE* fp = fopen(log_dir.c_str(), "a");
FILE* fp = fopen(summary_log_dir.c_str(), "a");
if (fp == nullptr) {
LOG(INFO) << "Open precision summary file:" << log_dir << "failed.";
LOG(INFO) << "Open precision summary file:" << summary_log_dir << "failed.";
return false;
} else {
fprintf(fp, "%s\n", string.c_str());
......@@ -85,7 +133,14 @@ class PrecisionProfiler {
std::string inst_precison_str = GetInstPrecision(inst);
}
PrecisionProfiler() {}
PrecisionProfiler() {
MkDirRecur(log_dir_);
const char* write_to_file_raw =
std::getenv("PADDLELITE_PRECISION_WRITE_TO_FILE");
    write_result_to_file_ = write_to_file_raw && atoi(write_to_file_raw) > 0;
}
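  // log_dir_ is created up front; per-tensor dumping is enabled only when the
  // environment variable PADDLELITE_PRECISION_WRITE_TO_FILE is set to a
  // positive integer.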
std::string GetSummaryHeader() {
using std::setw;
......@@ -102,9 +157,9 @@ class PrecisionProfiler {
<< " " << setw(15) << left << "std_deviation"
<< " " << setw(15) << left << "ave_grow_rate*" << std::endl;
// write to file with path: `log_dir`
if (log_dir_ != "") {
FILE* fp = fopen(log_dir_.c_str(), "a");
// write to file with path: `summary_log_dir`
if (summary_log_dir_ != "") {
FILE* fp = fopen(summary_log_dir_.c_str(), "a");
std::string header_str{ss.str()};
fprintf(fp, "%s\n", header_str.c_str());
fclose(fp);
......@@ -112,6 +167,18 @@ class PrecisionProfiler {
return ss.str();
}
std::string GetSummaryTail() {
STL::stringstream ss;
ss << "[note]" << std::endl;
ss << "1. `ave_grow_rate`: show the sequence value of tensor when std_dev "
"& mean are same."
<< std::endl;
ss << "2. Enable write each output tensor to file: `export "
"PADDLELITE_PRECISION_WRITE_TO_FILE=1` on ADB command line."
<< std::endl;
return ss.str();
}
template <typename T>
double compute_mean(const T* in, const size_t length) {
double sum = 0.;
......@@ -157,6 +224,17 @@ class PrecisionProfiler {
return false;
}
std::string rename_out_for_mem_reuse_pass(const std::string& old_name) {
if (out_tensor_names_map.find(old_name) == out_tensor_names_map.end()) {
out_tensor_names_map[old_name] = 1;
} else {
++out_tensor_names_map[old_name];
}
std::string new_name =
old_name + "_" + std::to_string(out_tensor_names_map[old_name]);
return new_name;
}
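  // memory_optimize_pass can make several kernels reuse the same output tensor
  // name, so a running counter keeps the profiler rows distinct, e.g. the first
  // occurrence of "conv2d_0.tmp_0" becomes "conv2d_0.tmp_0_1", the second
  // "conv2d_0.tmp_0_2" (the tensor name here is only an illustration).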
void compute_tensor_precision_info(const Tensor* in,
TargetType target_type,
PrecisionType precision_type,
......@@ -180,7 +258,7 @@ class PrecisionProfiler {
*std_dev =
compute_standard_deviation<float>(ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<float>(ptr, in->numel());
write_result_to_file&& write_tensorfile<float>(in, name);
write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
return;
}
case PRECISION(kAny): {
......@@ -189,7 +267,7 @@ class PrecisionProfiler {
*std_dev =
compute_standard_deviation<float>(ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<float>(ptr, in->numel());
write_result_to_file&& write_tensorfile<float>(in, name);
write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
return;
}
case PRECISION(kInt8): {
......@@ -198,7 +276,7 @@ class PrecisionProfiler {
*std_dev =
compute_standard_deviation<int8_t>(ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<int8_t>(ptr, in->numel());
write_result_to_file&& write_tensorfile<int8_t>(in, name);
write_result_to_file&& write_tensorfile<int8_t>(in, name, log_dir_);
return;
}
case PRECISION(kInt32): {
......@@ -207,7 +285,7 @@ class PrecisionProfiler {
*std_dev = compute_standard_deviation<int32_t>(
ptr, in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<int32_t>(ptr, in->numel());
write_result_to_file&& write_tensorfile<int32_t>(in, name);
write_result_to_file&& write_tensorfile<int32_t>(in, name, log_dir_);
return;
}
case PRECISION(kInt64): {
......@@ -254,7 +332,14 @@ class PrecisionProfiler {
real_out_v.data(), in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<float>(real_out_v.data(),
real_out_v.size());
write_result_to_file&& write_tensorfile<float>(in, name);
std::shared_ptr<lite::Tensor> real_out_t(new lite::Tensor);
real_out_t->Resize(in->dims());
float* real_out_data = real_out_t->mutable_data<float>();
memcpy(real_out_data,
real_out_v.data(),
real_out_v.size() * sizeof(float));
write_result_to_file&& write_tensorfile<float>(
real_out_t.get(), name, log_dir_);
return;
}
case DATALAYOUT(kNCHW): {
......@@ -269,7 +354,14 @@ class PrecisionProfiler {
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<float>(in_data_v.data(), in->numel());
write_result_to_file&& write_tensorfile<float>(in, name);
std::shared_ptr<lite::Tensor> real_out_t(new lite::Tensor);
real_out_t->Resize(in->dims());
float* real_out_data = real_out_t->mutable_data<float>();
memcpy(real_out_data,
in_data_v.data(),
in_data_v.size() * sizeof(float));
write_result_to_file&& write_tensorfile<float>(
real_out_t.get(), name, log_dir_);
return;
}
default:
......@@ -296,7 +388,7 @@ class PrecisionProfiler {
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<float>(in_data_v.data(), in->numel());
write_result_to_file&& write_tensorfile<float>(in, name);
write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
return;
}
case PRECISION(kInt32): {
......@@ -311,7 +403,7 @@ class PrecisionProfiler {
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<int>(in_data_v.data(), in->numel());
write_result_to_file&& write_tensorfile<float>(in, name);
write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
return;
}
case PRECISION(kInt64): {
......@@ -326,7 +418,7 @@ class PrecisionProfiler {
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<int64_t>(in_data_v.data(), in->numel());
write_result_to_file&& write_tensorfile<float>(in, name);
write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
return;
}
case PRECISION(kFP16): {
......@@ -347,7 +439,7 @@ class PrecisionProfiler {
in_data_v.data(), in->numel(), true, *mean);
*ave_grow_rate =
compute_average_grow_rate<float>(in_data_v.data(), in->numel());
write_result_to_file&& write_tensorfile<float>(in, name);
write_result_to_file&& write_tensorfile<float>(in, name, log_dir_);
return;
}
default:
......@@ -372,12 +464,12 @@ class PrecisionProfiler {
using std::left;
using std::fixed;
STL::stringstream ss;
bool write_result_to_file = false;
VLOG(1) << ">> Running kernel: " << inst->op()->op_info()->Repr()
<< " registered on " << TargetToStr(inst->kernel()->target()) << "/"
<< PrecisionToStr(inst->kernel()->precision()) << "/"
<< DataLayoutToStr(inst->kernel()->layout());
<< DataLayoutToStr(inst->kernel()->layout())
<< ", write_result_to_file_:" << write_result_to_file_;
std::string kernel_repr = inst->op()->op_info()->Repr();
std::string kernel_place = TargetToStr(inst->kernel()->target()) + "/" +
......@@ -404,6 +496,7 @@ class PrecisionProfiler {
std::string mean_str{"unused"};
std::string std_dev_str{"unused"};
std::string ave_grow_rate_str{"unused"};
std::string new_out_name = rename_out_for_mem_reuse_pass(out_name);
if (!is_unused(tout)) {
compute_tensor_precision_info(tout,
......@@ -413,14 +506,14 @@ class PrecisionProfiler {
&mean,
&std_dev,
&ave_grow_rate,
out_name,
write_result_to_file);
new_out_name,
write_result_to_file_);
mean_str = std::to_string(mean);
std_dev_str = std::to_string(std_dev);
ave_grow_rate_str = std::to_string(ave_grow_rate);
}
std::string kernel_info = op_name + ":" + kernel_place;
std::string output_arg_info = out_name + ":" +
std::string output_arg_info = new_out_name + ":" +
TargetToStr(type->target()) + "/" +
PrecisionToStr(type->precision()) +
"/" + DataLayoutToStr(type->layout());
......@@ -441,6 +534,7 @@ class PrecisionProfiler {
std::string mean_str{"unused"};
std::string std_dev_str{"unused"};
std::string ave_grow_rate_str{"unused"};
std::string new_out_name = rename_out_for_mem_reuse_pass(out_name);
if (!is_unused(tout)) {
compute_tensor_precision_info(tout,
......@@ -450,14 +544,14 @@ class PrecisionProfiler {
&mean,
&std_dev,
&ave_grow_rate,
out_name,
write_result_to_file);
new_out_name,
write_result_to_file_);
mean_str = std::to_string(mean);
std_dev_str = std::to_string(std_dev);
ave_grow_rate_str = std::to_string(ave_grow_rate);
}
std::string kernel_info = op_name + ":" + kernel_place;
std::string output_arg_info = out_name + ":" +
std::string output_arg_info = new_out_name + ":" +
TargetToStr(type->target()) + "/" +
PrecisionToStr(type->precision()) +
"/" + DataLayoutToStr(type->layout());
......@@ -471,12 +565,16 @@ class PrecisionProfiler {
}
}
}
write_precision_summary_tofile(ss.str(), log_dir_);
write_precision_summary_tofile(ss.str(), summary_log_dir_);
return ss.str();
}
private:
std::string log_dir_{"/storage/emulated/0/precision.log"};
std::string log_dir_{"/storage/emulated/0/PaddleLite_" + get_date_str() +
"/"};
std::string summary_log_dir_{log_dir_ + "precision_summary.log"};
std::map<std::string, size_t> out_tensor_names_map;
bool write_result_to_file_{false};
};
} // namespace profile
......
......@@ -302,7 +302,9 @@ void RuntimeProgram::Run() {
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 1);
#endif
#ifdef LITE_WITH_PRECISION_PROFILE
LOG(INFO) << "\n" << precision_profiler_summary;
LOG(INFO) << "\n"
<< precision_profiler_summary
<< inst_precision_profiler.GetSummaryTail();
#endif
}
......
......@@ -29,6 +29,21 @@ int64_t ShapeProduction(const shape_t& shape) {
return res;
}
std::string ShapePrint(const std::vector<shape_t>& shapes) {
std::string shapes_str{""};
for (size_t shape_idx = 0; shape_idx < shapes.size(); ++shape_idx) {
auto shape = shapes[shape_idx];
std::string shape_str;
for (auto i : shape) {
shape_str += std::to_string(i) + ",";
}
shapes_str += shape_str;
shapes_str +=
(shape_idx != 0 && shape_idx == shapes.size() - 1) ? "" : " : ";
}
return shapes_str;
}
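// e.g. ShapePrint({{1, 3, 224, 224}, {1, 5}}) yields "1,3,224,224, : 1,5,"
// (the trailing commas come from the per-dimension formatting above).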
std::string ShapePrint(const shape_t& shape) {
std::string shape_str{""};
for (auto i : shape) {
......@@ -37,6 +52,37 @@ std::string ShapePrint(const shape_t& shape) {
return shape_str;
}
std::vector<std::string> split_string(const std::string& str_in) {
std::vector<std::string> str_out;
std::string tmp_str = str_in;
while (!tmp_str.empty()) {
size_t next_offset = tmp_str.find(":");
str_out.push_back(tmp_str.substr(0, next_offset));
if (next_offset == std::string::npos) {
break;
} else {
tmp_str = tmp_str.substr(next_offset + 1);
}
}
return str_out;
}
std::vector<int64_t> get_shape(const std::string& str_shape) {
std::vector<int64_t> shape;
std::string tmp_str = str_shape;
while (!tmp_str.empty()) {
int dim = atoi(tmp_str.data());
shape.push_back(dim);
size_t next_offset = tmp_str.find(",");
if (next_offset == std::string::npos) {
break;
} else {
tmp_str = tmp_str.substr(next_offset + 1);
}
}
return shape;
}
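// Parsing example for the two helpers above:
//   split_string("1,3,224,224:1,5") -> {"1,3,224,224", "1,5"}
//   get_shape("1,3,224,224")        -> {1, 3, 224, 224}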
template <typename T>
double compute_mean(const T* in, const size_t length) {
double sum = 0.;
......@@ -70,7 +116,7 @@ inline double GetCurrentUS() {
}
void RunModel(std::string model_dir,
const shape_t& input_shape,
const std::vector<shape_t>& input_shapes,
size_t repeats,
size_t warmup,
size_t print_output_elem,
......@@ -111,12 +157,19 @@ void RunModel(std::string model_dir,
CreatePaddlePredictor<MobileConfig>(config);
// 3. Prepare input data
std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
input_tensor->Resize(
{input_shape[0], input_shape[1], input_shape[2], input_shape[3]});
auto* data = input_tensor->mutable_data<float>();
for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
data[i] = 1;
std::cout << "input_shapes.size():" << input_shapes.size() << std::endl;
for (int j = 0; j < input_shapes.size(); ++j) {
auto input_tensor = predictor->GetInput(j);
input_tensor->Resize(input_shapes[j]);
auto input_data = input_tensor->mutable_data<float>();
int input_num = 1;
for (int i = 0; i < input_shapes[j].size(); ++i) {
input_num *= input_shapes[j][i];
}
for (int i = 0; i < input_num; ++i) {
input_data[i] = 1.f;
}
}
// 4. Run predictor
......@@ -142,7 +195,7 @@ void RunModel(std::string model_dir,
}
avg_duration = sum_duration / static_cast<float>(repeats);
std::cout << "\n======= benchmark summary =======\n"
<< "input_shape(NCHW):" << ShapePrint(input_shape) << "\n"
<< "input_shape(s) (NCHW):" << ShapePrint(input_shapes) << "\n"
<< "model_dir:" << model_dir << "\n"
<< "warmup:" << warmup << "\n"
<< "repeats:" << repeats << "\n"
......@@ -184,18 +237,19 @@ void RunModel(std::string model_dir,
}
int main(int argc, char** argv) {
shape_t input_shape{1, 3, 224, 224}; // shape_t ==> std::vector<int64_t>
std::vector<std::string> str_input_shapes;
std::vector<shape_t> input_shapes{
{1, 3, 224, 224}}; // shape_t ==> std::vector<int64_t>
int repeats = 10;
int warmup = 10;
int print_output_elem = 0;
if (argc > 2 && argc < 9) {
if (argc > 2 && argc < 6) {
std::cerr << "usage: ./" << argv[0] << "\n"
<< " <naive_buffer_model_dir>\n"
<< " <input_n>\n"
<< " <input_c>\n"
<< " <input_h>\n"
<< " <input_w>\n"
<< " <raw_input_shapes>, eg: 1,3,224,224 for 1 input; "
"1,3,224,224:1,5 for 2 inputs\n"
<< " <repeats>\n"
<< " <warmup>\n"
<< " <print_output>" << std::endl;
......@@ -203,14 +257,19 @@ int main(int argc, char** argv) {
}
std::string model_dir = argv[1];
if (argc >= 9) {
input_shape[0] = atoi(argv[2]);
input_shape[1] = atoi(argv[3]);
input_shape[2] = atoi(argv[4]);
input_shape[3] = atoi(argv[5]);
repeats = atoi(argv[6]);
warmup = atoi(argv[7]);
print_output_elem = atoi(argv[8]);
if (argc >= 6) {
input_shapes.clear();
std::string raw_input_shapes = argv[2];
std::cout << "raw_input_shapes: " << raw_input_shapes << std::endl;
str_input_shapes = split_string(raw_input_shapes);
for (size_t i = 0; i < str_input_shapes.size(); ++i) {
std::cout << "input shape: " << str_input_shapes[i] << std::endl;
input_shapes.push_back(get_shape(str_input_shapes[i]));
}
repeats = atoi(argv[3]);
warmup = atoi(argv[4]);
print_output_elem = atoi(argv[5]);
}
// set arm power mode:
// 0 for big cluster, high performance
......@@ -220,7 +279,7 @@ int main(int argc, char** argv) {
size_t power_mode = 0;
RunModel(
model_dir, input_shape, repeats, warmup, print_output_elem, power_mode);
model_dir, input_shapes, repeats, warmup, print_output_elem, power_mode);
return 0;
}
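With the argument parsing above, the command line takes the model path, one colon-separated shape string, and then repeats, warmup and print_output, e.g. (binary and model names are illustrative only):

  ./demo_binary ./mobilenet_v1 1,3,224,224 100 10 0            # single input
  ./demo_binary ./multi_input_model 1,3,224,224:1,5 100 10 0   # two inputs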
......@@ -128,7 +128,7 @@ bool test_convert(bool cv_run,
for (int i = 0; i < test_iter; i++) {
clock_t begin = clock();
// resize default linear
image_preprocess.imageConvert(src, resize_lite);
image_preprocess.image_convert(src, resize_lite);
clock_t end = clock();
to_lite += (end - begin);
}
......@@ -226,7 +226,7 @@ bool test_flip(bool cv_run,
for (int i = 0; i < test_iter; i++) {
clock_t begin = clock();
// resize default linear
image_preprocess.imageFlip(src, resize_lite);
image_preprocess.image_flip(src, resize_lite);
clock_t end = clock();
to_lite += (end - begin);
}
......@@ -330,7 +330,7 @@ bool test_rotate(bool cv_run,
for (int i = 0; i < test_iter; i++) {
clock_t begin = clock();
// resize default linear
image_preprocess.imageRotate(src, resize_lite);
image_preprocess.image_rotate(src, resize_lite);
clock_t end = clock();
to_lite += (end - begin);
}
......@@ -426,7 +426,7 @@ bool test_resize(bool cv_run,
for (int i = 0; i < test_iter; i++) {
clock_t begin = clock();
// resize default linear
image_preprocess.imageResize(src, resize_lite);
image_preprocess.image_resize(src, resize_lite);
clock_t end = clock();
to_lite += (end - begin);
}
......@@ -526,7 +526,7 @@ bool test_crop(bool cv_run,
std::cout << "lite compute:" << std::endl;
for (int i = 0; i < test_iter; i++) {
clock_t begin = clock();
image_preprocess.imageCrop(
image_preprocess.image_crop(
src, resize_lite, dstFormat, srcw, srch, left_x, left_y, dstw, dsth);
clock_t end = clock();
to_lite += (end - begin);
......
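The hunks above reflect a rename of the CV preprocessing API from camelCase to snake_case: imageConvert, imageFlip, imageRotate, imageResize and imageCrop become image_convert, image_flip, image_rotate, image_resize and image_crop (and, in the demo hunk below, image2Tensor becomes image_to_tensor); the demo exercises the renamed calls in order.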
......@@ -88,13 +88,13 @@ void pre_process(const cv::Mat& img, int width, int height, Tensor dstTensor) {
uint8_t* rgb_ptr = new uint8_t[img.cols * img.rows * 3];
uint8_t* resize_ptr = new uint8_t[width * height * 3];
// do convert bgr--rgb
img_process.imageConvert(img_ptr, rgb_ptr);
img_process.image_convert(img_ptr, rgb_ptr);
// do resize
img_process.imageResize(rgb_ptr, resize_ptr);
img_process.image_resize(rgb_ptr, resize_ptr);
// data--tensor and normalize
float means[3] = {103.94f, 116.78f, 123.68f};
float scales[3] = {0.017f, 0.017f, 0.017f};
img_process.image2Tensor(
img_process.image_to_tensor(
resize_ptr, &dstTensor, LayoutType::kNCHW, means, scales);
float* data = dstTensor.mutable_data<float>();
#else
......
......@@ -14,6 +14,8 @@ lite_cc_library(subgraph_bridge_act_op_apu SRCS act_op.cc DEPS ${apu_subgraph_br
lite_cc_library(subgraph_bridge_pool_op_apu SRCS pool_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_softmax_op_apu SRCS softmax_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_fc_op_apu SRCS fc_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_concat_op_apu SRCS concat_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_conv_transpose_op_apu SRCS conv_transpose_op.cc DEPS ${apu_subgraph_bridge_deps})
set(apu_subgraph_bridges
......@@ -25,6 +27,8 @@ set(apu_subgraph_bridges
subgraph_bridge_softmax_op_apu
subgraph_bridge_fc_op_apu
subgraph_bridge_pool_op_apu
subgraph_bridge_conv_transpose_op_apu
subgraph_bridge_concat_op_apu
CACHE INTERNAL "apu_subgraph_bridges")
message(STATUS "+++++ apu_subgraph_bridges: ${apu_subgraph_bridges}")
(Diffs for 4 more files are collapsed and not shown here.)
......@@ -28,7 +28,7 @@ int Graph::Add(const std::string& name, std::shared_ptr<Node> node) {
LOG(FATAL) << "[APU] Node" << name << " is redefined.";
return -1;
} else {
VLOG(3) << " Add: " << name << " : " << node->index();
VLOG(5) << " Add: " << name << " : " << node->index();
auto ret = nodes_.insert(
std::make_pair(name, std::vector<std::shared_ptr<Node>>()));
CHECK(ret.second);
......
......@@ -22,3 +22,6 @@ USE_SUBGRAPH_BRIDGE(elementwise_mul, kAPU);
USE_SUBGRAPH_BRIDGE(fc, kAPU);
USE_SUBGRAPH_BRIDGE(pool2d, kAPU);
USE_SUBGRAPH_BRIDGE(softmax, kAPU);
USE_SUBGRAPH_BRIDGE(concat, kAPU);
USE_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, kAPU);
USE_SUBGRAPH_BRIDGE(conv2d_transpose, kAPU);
此差异已折叠。
......@@ -64,12 +64,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
xType.dimensions = &dims_x[0];
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
// input operand already exist
x_node = graph->Get(x_name);
VLOG(3) << "Graph has " << x_name << ",index: " << x_node->index();
} else {
// add input operand
NeuronModel_addOperand(model, &xType); // 0: input
NeuronModel_addOperand(model, &xType); // Operand 0: input
x_node = graph->Add(x_name, dims_x);
}
VLOG(3) << "input_scale size: " << input_scale
......@@ -80,7 +78,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
NeuronOperandType betaType;
betaType.type = NEURON_FLOAT32;
betaType.dimensionCount = 0;
NeuronModel_addOperand(model, &betaType); // 1: beta
NeuronModel_addOperand(model, &betaType); // Operand 1: beta
std::shared_ptr<Node> beta_node = nullptr;
beta_node = graph->Add(x_name + "_beta", dims_int32);
......@@ -88,7 +86,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
NeuronOperandType axisType;
axisType.type = NEURON_INT32;
axisType.dimensionCount = 0;
NeuronModel_addOperand(model, &axisType); // 2: axis
NeuronModel_addOperand(model, &axisType); // Operand 2: axis
std::shared_ptr<Node> axis_node = nullptr;
axis_node = graph->Add(x_name + "_axis", dims_int32);
......@@ -99,7 +97,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
outType.zeroPoint = 128;
outType.dimensionCount = x_dims.size();
outType.dimensions = &dims_x[0];
NeuronModel_addOperand(model, &outType); // 3: output
NeuronModel_addOperand(model, &outType); // Operand 3: output
std::shared_ptr<Node> out_node = nullptr;
out_node = graph->Add(out_name, dims_x);
VLOG(3) << "out_scale: " << out_scale;
......@@ -112,8 +110,9 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
axis_val[0] = axis;
NeuronModel_setOperandValue(
model, axis_node->index(), axis_val, sizeof(int32_t) * 1);
std::vector<uint32_t> addInIndex = {
x_node->index(), beta_node->index(), axis_node->index()};
std::vector<uint32_t> addInIndex = {x_node->index(), // 0: input
beta_node->index(), // 1: beta
axis_node->index()}; // 2: axis
std::vector<uint32_t> addOutIndex = {out_node->index()};
int neuron_errCode = NeuronModel_addOperation(model,
NEURON_SOFTMAX,
......
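To recap the operand layout the softmax bridge assembles above: operand 0 is the input tensor, operand 1 the float32 beta scalar, operand 2 the int32 axis scalar, and operand 3 the output tensor; NEURON_SOFTMAX is then added with inputs {input, beta, axis} and the single output operand.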
(Diffs for the remaining files are collapsed and not shown here.)